Mercurial > repos > dave > data_manager_build_kraken2_database
comparison data_manager/kraken2_build_database.py @ 0:bd47b9f87d67 draft default tip
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_build_kraken2_database/ commit 68cd9a8ae50c5dfe6b667062a5172010511bcaff-dirty"
author | dave |
---|---|
date | Tue, 01 Dec 2020 16:07:40 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:bd47b9f87d67 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 from __future__ import print_function | |
4 | |
5 import argparse | |
6 import datetime | |
7 import errno | |
8 import json | |
9 import os | |
10 import shutil | |
11 import subprocess | |
12 import sys | |
13 import tarfile | |
14 from enum import Enum | |
15 | |
16 try: | |
17 # Python3 | |
18 from urllib.request import urlopen | |
19 except ImportError: | |
20 from urllib2 import urlopen | |
21 | |
22 | |
# Default key used under "data_tables" in the JSON produced by this script;
# it is the default ``data_table_name`` of every kraken2_build_* function.
DATA_TABLE_NAME = "kraken2_databases"
24 | |
25 | |
class KrakenDatabaseTypes(Enum):
    """Kinds of Kraken2 database selectable through ``--database-type``."""

    standard = 'standard'
    minikraken = 'minikraken'
    special = 'special'
    custom = 'custom'

    def __str__(self):
        # Render a member as its bare value; main() compares str(member)
        # against plain strings such as 'standard'.
        return str(self.value)
34 | |
35 | |
class SpecialDatabaseTypes(Enum):
    """Special databases selectable through ``--special-database-type``."""

    rdp = 'rdp'
    greengenes = 'greengenes'
    silva = 'silva'

    def __str__(self):
        # Render a member as its bare value (e.g. 'silva').
        return str(self.value)
43 | |
44 | |
class Minikraken2Versions(Enum):
    """MiniKraken2 versions selectable through ``--minikraken2-version``."""

    v1 = 'v1'
    v2 = 'v2'

    def __str__(self):
        # Render a member as its bare value (e.g. 'v2').
        return str(self.value)
51 | |
class Minikraken2Releases(Enum):
    """MiniKraken2 releases selectable through ``--minikraken2-release``."""

    March_2020 = 'March_2020'
    April_2019 = 'April_2019'

    def __str__(self):
        # Render a member as its bare value (e.g. 'April_2019').
        return str(self.value)
58 | |
59 | |
def kraken2_build_standard(kraken2_args, target_directory, data_table_name=DATA_TABLE_NAME):
    """Build the standard Kraken2 database and return its data table entry.

    Runs ``kraken2-build --standard`` with ``target_directory`` as working
    directory and, when requested, a follow-up ``kraken2-build --clean``.

    :param kraken2_args: dict with the keys ``kmer_len``, ``minimizer_len``,
        ``minimizer_spaces``, ``load_factor``, ``threads`` and ``clean``.
    :param target_directory: directory used as ``cwd`` for ``kraken2-build``;
        the database path passed to ``--db`` is relative to it.
    :param data_table_name: key used under ``data_tables`` in the result.
    :returns: ``{'data_tables': {data_table_name: [entry]}}`` where ``entry``
        holds the ``value``, ``name`` and ``path`` of the new database.
    :raises subprocess.CalledProcessError: if ``kraken2-build`` exits with a
        non-zero status.
    """
    now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")

    kmer_len = str(kraken2_args["kmer_len"])
    minimizer_len = str(kraken2_args["minimizer_len"])
    minimizer_spaces = str(kraken2_args["minimizer_spaces"])
    load_factor = str(kraken2_args["load_factor"])
    threads = str(kraken2_args["threads"])

    database_value = "_".join([
        now,
        "standard",
        "kmer-len", kmer_len,
        "minimizer-len", minimizer_len,
        "minimizer-spaces", minimizer_spaces,
        "load-factor", load_factor,
    ])

    # Human-readable label: every build parameter is listed as key=value
    # inside one balanced pair of parentheses.
    database_name = " ".join([
        "Standard",
        "(Created:",
        now + ",",
        "kmer-len=" + kmer_len + ",",
        "minimizer-len=" + minimizer_len + ",",
        "minimizer-spaces=" + minimizer_spaces + ",",
        "load-factor=" + load_factor + ")",
    ])

    # Relative path: kraken2-build runs with cwd=target_directory.
    database_path = database_value

    args = [
        '--threads', threads,
        '--standard',
        '--kmer-len', kmer_len,
        '--minimizer-len', minimizer_len,
        '--minimizer-spaces', minimizer_spaces,
        '--load-factor', load_factor,
        '--db', database_path
    ]

    subprocess.check_call(['kraken2-build'] + args, cwd=target_directory)

    if kraken2_args["clean"]:
        args = [
            '--threads', threads,
            '--clean',
            '--db', database_path
        ]

        subprocess.check_call(['kraken2-build'] + args, cwd=target_directory)

    data_table_entry = {
        'data_tables': {
            data_table_name: [
                {
                    "value": database_value,
                    "name": database_name,
                    "path": database_path,
                }
            ]
        }
    }

    return data_table_entry
118 | |
119 | |
def kraken2_build_minikraken(minikraken2_version, minikraken2_release, target_directory, data_table_name=DATA_TABLE_NAME):
    """Download a prebuilt MiniKraken2 database and return its data table entry.

    The archive is saved as ``tmp_data.tar.gz`` in the current working
    directory and its regular files are unpacked, flattened, into a new
    subdirectory of ``target_directory``.

    :param minikraken2_version: version string (e.g. ``'v1'``); only used
        when ``minikraken2_release`` is ``'April_2019'``.
    :param minikraken2_release: release string; ``'April_2019'`` selects the
        versioned 2019 archive, any other value the 2020-03 archive.
    :param target_directory: directory below which the database directory is
        created.
    :param data_table_name: key used under ``data_tables`` in the result.
    :returns: ``{'data_tables': {data_table_name: [entry]}}`` where ``entry``
        holds the ``value``, ``name`` and ``path`` of the new database.
    """
    now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")

    value_parts = [now, "minikraken2", minikraken2_release, "8GB"]
    name_parts = ["Minikraken2", minikraken2_release, "8GB", "(Created: %s)" % now]

    if minikraken2_release == 'April_2019':
        value_parts.insert(3, minikraken2_version)
        name_parts.insert(2, minikraken2_version)
        url = (
            'ftp://ftp.ccb.jhu.edu/pub/data/kraken2_dbs/old/minikraken2_%s_8GB_201904.tgz'
            % minikraken2_version
        )
    else:
        url = 'ftp://ftp.ccb.jhu.edu/pub/data/kraken2_dbs/minikraken_8GB_202003.tgz'

    database_value = "_".join(value_parts)

    database_name = " ".join(name_parts)

    database_path = database_value

    # download the minikraken2 data; try/finally (rather than ``with``) so the
    # response is always closed, including with Python 2's urllib2 whose
    # response objects are not context managers
    src = urlopen(url)
    try:
        with open('tmp_data.tar.gz', 'wb') as dst:
            shutil.copyfileobj(src, dst)
    finally:
        src.close()

    # unpack the downloaded archive to the target directory
    extract_directory = os.path.join(target_directory, database_path)
    with tarfile.open('tmp_data.tar.gz', 'r:gz') as fh:
        for member in fh.getmembers():
            if member.isreg():
                # drop any leading directories so every regular file lands
                # directly inside the database directory
                member.name = os.path.basename(member.name)
                fh.extract(member, extract_directory)

    data_table_entry = {
        'data_tables': {
            data_table_name: [
                {
                    "value": database_value,
                    "name": database_name,
                    "path": database_path,
                }
            ]
        }
    }

    return data_table_entry
166 | |
167 | |
def kraken2_build_special(kraken2_args, target_directory, data_table_name=DATA_TABLE_NAME):
    """Build a special (RDP, Greengenes or Silva) Kraken2 database.

    Runs ``kraken2-build --special <type>`` with ``target_directory`` as
    working directory and, when requested, a follow-up
    ``kraken2-build --clean``.

    :param kraken2_args: dict with the keys ``special_database_type``
        (``'rdp'``, ``'greengenes'`` or ``'silva'``), ``kmer_len``,
        ``minimizer_len``, ``minimizer_spaces``, ``load_factor``, ``threads``
        and ``clean``.
    :param target_directory: directory used as ``cwd`` for ``kraken2-build``;
        the database path passed to ``--db`` is relative to it.
    :param data_table_name: key used under ``data_tables`` in the result.
    :returns: ``{'data_tables': {data_table_name: [entry]}}`` where ``entry``
        holds the ``value``, ``name`` and ``path`` of the new database.
    :raises KeyError: if ``special_database_type`` is not one of the known
        special databases.
    :raises subprocess.CalledProcessError: if ``kraken2-build`` exits with a
        non-zero status.
    """
    now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")

    special_database_names = {
        "rdp": "RDP",
        "greengenes": "Greengenes",
        "silva": "Silva",
    }

    special_database_type = kraken2_args["special_database_type"]
    kmer_len = str(kraken2_args["kmer_len"])
    minimizer_len = str(kraken2_args["minimizer_len"])
    minimizer_spaces = str(kraken2_args["minimizer_spaces"])
    load_factor = str(kraken2_args["load_factor"])
    threads = str(kraken2_args["threads"])

    database_value = "_".join([
        now,
        special_database_type,
        "kmer-len", kmer_len,
        "minimizer-len", minimizer_len,
        "minimizer-spaces", minimizer_spaces,
        "load-factor", load_factor,
    ])

    # Human-readable label: every build parameter is listed as key=value
    # inside one balanced pair of parentheses.
    database_name = " ".join([
        special_database_names[special_database_type],
        "(Created:",
        now + ",",
        "kmer-len=" + kmer_len + ",",
        "minimizer-len=" + minimizer_len + ",",
        "minimizer-spaces=" + minimizer_spaces + ",",
        "load-factor=" + load_factor + ")",
    ])

    # Relative path: kraken2-build runs with cwd=target_directory.
    database_path = database_value

    args = [
        '--threads', threads,
        '--special', special_database_type,
        '--kmer-len', kmer_len,
        '--minimizer-len', minimizer_len,
        '--minimizer-spaces', minimizer_spaces,
        '--load-factor', load_factor,
        '--db', database_path
    ]

    subprocess.check_call(['kraken2-build'] + args, cwd=target_directory)

    if kraken2_args["clean"]:
        args = [
            '--threads', threads,
            '--clean',
            '--db', database_path
        ]

        subprocess.check_call(['kraken2-build'] + args, cwd=target_directory)

    data_table_entry = {
        'data_tables': {
            data_table_name: [
                {
                    "value": database_value,
                    "name": database_name,
                    "path": database_path,
                }
            ]
        }
    }

    return data_table_entry
233 | |
234 | |
def kraken2_build_custom(kraken2_args, custom_database_name, target_directory, data_table_name=DATA_TABLE_NAME):
    """Build a custom Kraken2 database from a user-supplied FASTA file.

    Invokes ``kraken2-build`` in ``target_directory`` for each step in turn:
    download the taxonomy, add the FASTA to the library, build, and
    optionally clean.

    :param kraken2_args: dict with the keys ``threads``, ``skip_maps``,
        ``custom_fasta``, ``kmer_len``, ``minimizer_len``,
        ``minimizer_spaces``, ``load_factor`` and ``clean``.
    :param custom_database_name: value passed to ``--db``; also used as the
        ``value``, ``name`` and ``path`` of the returned entry.
    :param target_directory: directory used as ``cwd`` for ``kraken2-build``.
    :param data_table_name: key used under ``data_tables`` in the result.
    :returns: ``{'data_tables': {data_table_name: [entry]}}``.
    :raises subprocess.CalledProcessError: if any ``kraken2-build`` step
        exits with a non-zero status.
    """

    def run_step(step_options, trailing_options=()):
        # Every step shares the same frame:
        #   kraken2-build --threads N <step options> --db NAME [<trailing>]
        command = ['kraken2-build', '--threads', str(kraken2_args["threads"])]
        command.extend(step_options)
        command.extend(['--db', custom_database_name])
        command.extend(trailing_options)
        subprocess.check_call(command, cwd=target_directory)

    # Step 1: taxonomy download (optionally without accession maps).
    skip_maps_flag = ['--skip-maps'] if kraken2_args['skip_maps'] else []
    run_step(['--download-taxonomy'], skip_maps_flag)

    # Step 2: register the user's FASTA file in the library.
    run_step(['--add-to-library', kraken2_args["custom_fasta"]])

    # Step 3: the actual database build.
    run_step([
        '--build',
        '--kmer-len', str(kraken2_args["kmer_len"]),
        '--minimizer-len', str(kraken2_args["minimizer_len"]),
        '--minimizer-spaces', str(kraken2_args["minimizer_spaces"]),
        '--load-factor', str(kraken2_args["load_factor"]),
    ])

    # Step 4 (optional): remove intermediate files.
    if kraken2_args["clean"]:
        run_step(['--clean'])

    entry = {
        "value": custom_database_name,
        "name": custom_database_name,
        "path": custom_database_name
    }

    return {'data_tables': {data_table_name: [entry]}}
290 | |
291 | |
def main():
    """Command-line entry point: build the requested database and report it.

    Reads the JSON file named by the positional argument, takes the target
    directory from ``output_data[0].extra_files_path``, builds the database
    selected with ``--database-type`` and overwrites the same JSON file with
    the resulting data table entry.

    Options that are mandatory for the selected database type are validated
    up front, so a missing value is reported as a usage error instead of
    leaking ``None`` into database names, download URLs or the
    ``kraken2-build`` command line.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('data_manager_json')
    parser.add_argument('--kmer-len', dest='kmer_len', type=int, default=35, help='kmer length')
    parser.add_argument('--minimizer-len', dest='minimizer_len', type=int, default=31, help='minimizer length')
    parser.add_argument('--minimizer-spaces', dest='minimizer_spaces', type=int, default=6, help='minimizer spaces')
    parser.add_argument('--load-factor', dest='load_factor', type=float, default=0.7, help='load factor')
    parser.add_argument('--threads', dest='threads', type=int, default=1, help='threads')
    parser.add_argument('--database-type', dest='database_type', type=KrakenDatabaseTypes, choices=list(KrakenDatabaseTypes), required=True, help='type of kraken database to build')
    parser.add_argument('--minikraken2-version', dest='minikraken2_version', type=Minikraken2Versions, choices=list(Minikraken2Versions), help='MiniKraken2 version (only applies to --database-type minikraken and the April_2019 release)')
    parser.add_argument('--minikraken2-release', dest='minikraken2_release', type=Minikraken2Releases, choices=list(Minikraken2Releases), help='MiniKraken2 release (only applies to --database-type minikraken)')
    parser.add_argument('--special-database-type', dest='special_database_type', type=SpecialDatabaseTypes, choices=list(SpecialDatabaseTypes), help='type of special database to build (only applies to --database-type special)')
    parser.add_argument('--custom-fasta', dest='custom_fasta', help='fasta file for custom database (only applies to --database-type custom)')
    parser.add_argument('--custom-database-name', dest='custom_database_name', help='Name for custom database (only applies to --database-type custom)')
    parser.add_argument('--skip-maps', dest='skip_maps', action='store_true', help='')
    parser.add_argument('--clean', dest='clean', action='store_true', help='Clean up extra files')
    args = parser.parse_args()

    database_type = str(args.database_type)

    # Reject incomplete option sets before touching the filesystem;
    # parser.error() prints the usage message and exits with status 2.
    if database_type == 'minikraken':
        if args.minikraken2_release is None:
            parser.error('--minikraken2-release is required with --database-type minikraken')
        if str(args.minikraken2_release) == 'April_2019' and args.minikraken2_version is None:
            parser.error('--minikraken2-version is required with --minikraken2-release April_2019')
    elif database_type == 'special':
        if args.special_database_type is None:
            parser.error('--special-database-type is required with --database-type special')
    elif database_type == 'custom':
        if not args.custom_fasta:
            parser.error('--custom-fasta is required with --database-type custom')
        if not args.custom_database_name:
            parser.error('--custom-database-name is required with --database-type custom')

    with open(args.data_manager_json) as fh:
        data_manager_input = json.load(fh)

    target_directory = data_manager_input['output_data'][0]['extra_files_path']

    try:
        os.mkdir(target_directory)
    except OSError as exc:
        # An already existing directory is fine; anything else is an error.
        if exc.errno == errno.EEXIST and os.path.isdir(target_directory):
            pass
        else:
            raise

    # Build options shared by every kraken2-build based database type.
    build_args = {
        "kmer_len": args.kmer_len,
        "minimizer_len": args.minimizer_len,
        "minimizer_spaces": args.minimizer_spaces,
        "load_factor": args.load_factor,
        "threads": args.threads,
        "clean": args.clean,
    }

    data_manager_output = {}

    if database_type == 'standard':
        data_manager_output = kraken2_build_standard(
            build_args,
            target_directory,
        )
    elif database_type == 'minikraken':
        data_manager_output = kraken2_build_minikraken(
            str(args.minikraken2_version),
            str(args.minikraken2_release),
            target_directory
        )
    elif database_type == 'special':
        kraken2_args = dict(
            build_args,
            special_database_type=str(args.special_database_type),
        )
        data_manager_output = kraken2_build_special(
            kraken2_args,
            target_directory,
        )
    elif database_type == 'custom':
        kraken2_args = dict(
            build_args,
            custom_fasta=args.custom_fasta,
            skip_maps=args.skip_maps,
        )
        data_manager_output = kraken2_build_custom(
            kraken2_args,
            args.custom_database_name,
            target_directory,
        )
    else:
        # Defensive only: argparse ``choices`` already restricts the value.
        sys.exit("Invalid database type")

    # The input JSON file is overwritten with the data table entry.
    with open(args.data_manager_json, 'w') as fh:
        json.dump(data_manager_output, fh, sort_keys=True)
379 | |
380 | |
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()