comparison data_manager/kraken2_build_database.py @ 0:bd47b9f87d67 draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_build_kraken2_database/ commit 68cd9a8ae50c5dfe6b667062a5172010511bcaff-dirty"
author dave
date Tue, 01 Dec 2020 16:07:40 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:bd47b9f87d67
1 #!/usr/bin/env python
2
3 from __future__ import print_function
4
5 import argparse
6 import datetime
7 import errno
8 import json
9 import os
10 import shutil
11 import subprocess
12 import sys
13 import tarfile
14 from enum import Enum
15
16 try:
17 # Python3
18 from urllib.request import urlopen
19 except ImportError:
20 from urllib2 import urlopen
21
22
23 DATA_TABLE_NAME = "kraken2_databases"
24
25
class KrakenDatabaseTypes(Enum):
    """Kinds of Kraken2 database accepted by ``--database-type``."""

    # Declaration order is the order argparse lists the choices in.
    standard = 'standard'
    minikraken = 'minikraken'
    special = 'special'
    custom = 'custom'

    def __str__(self):
        # Render as the bare value so it reads the same as the CLI argument.
        return str(self.value)
34
35
class SpecialDatabaseTypes(Enum):
    """Special databases accepted by ``--special-database-type``."""

    rdp = 'rdp'
    greengenes = 'greengenes'
    silva = 'silva'

    def __str__(self):
        # Render as the bare value so it reads the same as the CLI argument.
        return str(self.value)
43
44
class Minikraken2Versions(Enum):
    """MiniKraken2 versions accepted by ``--minikraken2-version``."""

    v1 = 'v1'
    v2 = 'v2'

    def __str__(self):
        # Render as the bare value so it reads the same as the CLI argument.
        return str(self.value)
51
class Minikraken2Releases(Enum):
    """MiniKraken2 releases accepted by ``--minikraken2-release``."""

    March_2020 = 'March_2020'
    April_2019 = 'April_2019'

    def __str__(self):
        # Render as the bare value so it reads the same as the CLI argument.
        return str(self.value)
58
59
def kraken2_build_standard(kraken2_args, target_directory, data_table_name=DATA_TABLE_NAME):
    """Build the standard Kraken2 database with ``kraken2-build --standard``.

    Args:
        kraken2_args: Mapping providing the keys ``kmer_len``,
            ``minimizer_len``, ``minimizer_spaces``, ``load_factor``,
            ``threads`` and ``clean``.
        target_directory: Working directory ``kraken2-build`` is run in; the
            ``--db`` path is given relative to it.
        data_table_name: Name of the data table the new entry is filed under.

    Returns:
        dict: ``{'data_tables': {data_table_name: [entry]}}`` where ``entry``
        has the keys ``value``, ``name`` and ``path``.

    Raises:
        subprocess.CalledProcessError: If ``kraken2-build`` exits non-zero.
    """
    now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")

    kmer_len = str(kraken2_args["kmer_len"])
    minimizer_len = str(kraken2_args["minimizer_len"])
    minimizer_spaces = str(kraken2_args["minimizer_spaces"])
    load_factor = str(kraken2_args["load_factor"])
    threads = str(kraken2_args["threads"])

    # Unique identifier; it doubles as the database directory name.
    database_value = "_".join([
        now,
        "standard",
        "kmer-len", kmer_len,
        "minimizer-len", minimizer_len,
        "minimizer-spaces", minimizer_spaces,
        "load-factor", load_factor,
    ])

    # Human-readable label. All build parameters live inside one balanced
    # pair of parentheses, each written as key=value.
    database_name = " ".join([
        "Standard",
        "(Created:",
        now + ",",
        "kmer-len=" + kmer_len + ",",
        "minimizer-len=" + minimizer_len + ",",
        "minimizer-spaces=" + minimizer_spaces + ",",
        "load-factor=" + load_factor + ")",
    ])

    database_path = database_value

    args = [
        '--threads', threads,
        '--standard',
        '--kmer-len', kmer_len,
        '--minimizer-len', minimizer_len,
        '--minimizer-spaces', minimizer_spaces,
        '--load-factor', load_factor,
        '--db', database_path,
    ]

    subprocess.check_call(['kraken2-build'] + args, cwd=target_directory)

    if kraken2_args["clean"]:
        # Optional second pass asking kraken2-build to clean the database.
        args = [
            '--threads', threads,
            '--clean',
            '--db', database_path,
        ]

        subprocess.check_call(['kraken2-build'] + args, cwd=target_directory)

    data_table_entry = {
        'data_tables': {
            data_table_name: [
                {
                    "value": database_value,
                    "name": database_name,
                    "path": database_path,
                }
            ]
        }
    }

    return data_table_entry
118
119
def kraken2_build_minikraken(minikraken2_version, minikraken2_release, target_directory, data_table_name=DATA_TABLE_NAME):
    """Download a prebuilt MiniKraken2 8GB archive and unpack it.

    Args:
        minikraken2_version: Version string; only used (in the download URL
            and in the entry's value/name) when ``minikraken2_release`` is
            ``'April_2019'``.
        minikraken2_release: Release string. ``'April_2019'`` selects the
            versioned 201904 archive; any other value selects the 202003
            archive.
        target_directory: Directory under which the database directory is
            created.
        data_table_name: Name of the data table the new entry is filed under.

    Returns:
        dict: ``{'data_tables': {data_table_name: [entry]}}`` where ``entry``
        has the keys ``value``, ``name`` and ``path``.
    """
    # Function-scope import: closing() works for both the Python 2 and the
    # Python 3 urlopen() return values.
    from contextlib import closing

    now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")

    value_parts = [now, "minikraken2", minikraken2_release, "8GB"]
    name_parts = ["Minikraken2", minikraken2_release, "8GB", "(Created: %s)" % now]

    if minikraken2_release == 'April_2019':
        # Only the April 2019 release is published in several versions.
        value_parts.insert(3, minikraken2_version)
        name_parts.insert(2, minikraken2_version)
        url = (
            'ftp://ftp.ccb.jhu.edu/pub/data/kraken2_dbs/old/minikraken2_%s_8GB_201904.tgz'
            % minikraken2_version
        )
    else:
        url = 'ftp://ftp.ccb.jhu.edu/pub/data/kraken2_dbs/minikraken_8GB_202003.tgz'

    database_value = "_".join(value_parts)

    database_name = " ".join(name_parts)

    database_path = database_value

    # download the minikraken2 data; the connection is closed even when the
    # copy fails part-way through
    with closing(urlopen(url)) as src:
        with open('tmp_data.tar.gz', 'wb') as dst:
            shutil.copyfileobj(src, dst)

    # unpack the downloaded archive to the target directory
    destination = os.path.join(target_directory, database_path)
    with tarfile.open('tmp_data.tar.gz', 'r:gz') as fh:
        for member in fh.getmembers():
            if member.isreg():
                # Drop any leading directories so every regular file lands
                # directly inside the database directory.
                member.name = os.path.basename(member.name)
                fh.extract(member, destination)

    data_table_entry = {
        'data_tables': {
            data_table_name: [
                {
                    "value": database_value,
                    "name": database_name,
                    "path": database_path,
                }
            ]
        }
    }

    return data_table_entry
166
167
def kraken2_build_special(kraken2_args, target_directory, data_table_name=DATA_TABLE_NAME):
    """Build a special Kraken2 database with ``kraken2-build --special``.

    Args:
        kraken2_args: Mapping providing the keys ``special_database_type``
            (one of ``'rdp'``, ``'greengenes'``, ``'silva'``), ``kmer_len``,
            ``minimizer_len``, ``minimizer_spaces``, ``load_factor``,
            ``threads`` and ``clean``.
        target_directory: Working directory ``kraken2-build`` is run in; the
            ``--db`` path is given relative to it.
        data_table_name: Name of the data table the new entry is filed under.

    Returns:
        dict: ``{'data_tables': {data_table_name: [entry]}}`` where ``entry``
        has the keys ``value``, ``name`` and ``path``.

    Raises:
        KeyError: If ``special_database_type`` has no known display name.
        subprocess.CalledProcessError: If ``kraken2-build`` exits non-zero.
    """
    now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")

    # Display names used in the human-readable label.
    special_database_names = {
        "rdp": "RDP",
        "greengenes": "Greengenes",
        "silva": "Silva",
    }

    special_database_type = kraken2_args["special_database_type"]
    kmer_len = str(kraken2_args["kmer_len"])
    minimizer_len = str(kraken2_args["minimizer_len"])
    minimizer_spaces = str(kraken2_args["minimizer_spaces"])
    load_factor = str(kraken2_args["load_factor"])
    threads = str(kraken2_args["threads"])

    # Unique identifier; it doubles as the database directory name.
    database_value = "_".join([
        now,
        special_database_type,
        "kmer-len", kmer_len,
        "minimizer-len", minimizer_len,
        "minimizer-spaces", minimizer_spaces,
        "load-factor", load_factor,
    ])

    # Human-readable label. Only the final parameter closes the
    # parenthesis, so the pair stays balanced.
    database_name = " ".join([
        special_database_names[special_database_type],
        "(Created:",
        now + ",",
        "kmer-len=" + kmer_len + ",",
        "minimizer-len=" + minimizer_len + ",",
        "minimizer-spaces=" + minimizer_spaces + ",",
        "load-factor=" + load_factor + ")",
    ])

    database_path = database_value

    args = [
        '--threads', threads,
        '--special', special_database_type,
        '--kmer-len', kmer_len,
        '--minimizer-len', minimizer_len,
        '--minimizer-spaces', minimizer_spaces,
        '--load-factor', load_factor,
        '--db', database_path,
    ]

    subprocess.check_call(['kraken2-build'] + args, cwd=target_directory)

    if kraken2_args["clean"]:
        # Optional second pass asking kraken2-build to clean the database.
        args = [
            '--threads', threads,
            '--clean',
            '--db', database_path,
        ]

        subprocess.check_call(['kraken2-build'] + args, cwd=target_directory)

    data_table_entry = {
        'data_tables': {
            data_table_name: [
                {
                    "value": database_value,
                    "name": database_name,
                    "path": database_path,
                }
            ]
        }
    }

    return data_table_entry
233
234
def kraken2_build_custom(kraken2_args, custom_database_name, target_directory, data_table_name=DATA_TABLE_NAME):
    """Build a custom Kraken2 database from a user-supplied FASTA file.

    Runs ``kraken2-build`` up to four times inside ``target_directory``:
    taxonomy download, adding the FASTA to the library, the build itself
    and, when requested, a clean-up pass.

    Args:
        kraken2_args: Mapping providing the keys ``threads``, ``skip_maps``,
            ``custom_fasta``, ``kmer_len``, ``minimizer_len``,
            ``minimizer_spaces``, ``load_factor`` and ``clean``.
        custom_database_name: Used as the ``--db`` argument and as the
            entry's value, name and path.
        target_directory: Working directory for every ``kraken2-build`` call.
        data_table_name: Name of the data table the new entry is filed under.

    Returns:
        dict: ``{'data_tables': {data_table_name: [entry]}}`` where ``entry``
        has the keys ``value``, ``name`` and ``path``.

    Raises:
        subprocess.CalledProcessError: If any ``kraken2-build`` call exits
            non-zero.
    """

    def run_step(step_options):
        # Every invocation starts with the thread count and runs in the
        # target directory.
        command = ['kraken2-build', '--threads', str(kraken2_args["threads"])]
        subprocess.check_call(command + step_options, cwd=target_directory)

    # Step 1: fetch the taxonomy, optionally without the accession maps.
    taxonomy_options = ['--download-taxonomy', '--db', custom_database_name]
    if kraken2_args['skip_maps']:
        taxonomy_options.append('--skip-maps')
    run_step(taxonomy_options)

    # Step 2: add the user's sequences to the library.
    run_step([
        '--add-to-library', kraken2_args["custom_fasta"],
        '--db', custom_database_name,
    ])

    # Step 3: build the database with the requested parameters.
    run_step([
        '--build',
        '--kmer-len', str(kraken2_args["kmer_len"]),
        '--minimizer-len', str(kraken2_args["minimizer_len"]),
        '--minimizer-spaces', str(kraken2_args["minimizer_spaces"]),
        '--load-factor', str(kraken2_args["load_factor"]),
        '--db', custom_database_name,
    ])

    # Step 4 (optional): clean-up pass.
    if kraken2_args["clean"]:
        run_step(['--clean', '--db', custom_database_name])

    entry = dict.fromkeys(("value", "name", "path"), custom_database_name)
    return {'data_tables': {data_table_name: [entry]}}
290
291
def main():
    """Parse the command line, build the requested database, write the result.

    The positional ``data_manager_json`` file is first read to obtain
    ``output_data[0]['extra_files_path']`` (the directory the database is
    built in) and is finally overwritten with the data table entry returned
    by the selected build function.

    Option combinations that cannot succeed are rejected through
    ``parser.error`` before any download or build is started.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('data_manager_json')
    parser.add_argument('--kmer-len', dest='kmer_len', type=int, default=35, help='kmer length')
    parser.add_argument('--minimizer-len', dest='minimizer_len', type=int, default=31, help='minimizer length')
    parser.add_argument('--minimizer-spaces', dest='minimizer_spaces', type=int, default=6, help='minimizer spaces')
    parser.add_argument('--load-factor', dest='load_factor', type=float, default=0.7, help='load factor')
    parser.add_argument('--threads', dest='threads', type=int, default=1, help='threads')
    parser.add_argument('--database-type', dest='database_type', type=KrakenDatabaseTypes, choices=list(KrakenDatabaseTypes), required=True, help='type of kraken database to build')
    parser.add_argument('--minikraken2-version', dest='minikraken2_version', type=Minikraken2Versions, choices=list(Minikraken2Versions), help='MiniKraken2 version (only applies to --database-type minikraken and the April_2019 release)')
    parser.add_argument('--minikraken2-release', dest='minikraken2_release', type=Minikraken2Releases, choices=list(Minikraken2Releases), help='MiniKraken2 release (only applies to --database-type minikraken)')
    parser.add_argument('--special-database-type', dest='special_database_type', type=SpecialDatabaseTypes, choices=list(SpecialDatabaseTypes), help='type of special database to build (only applies to --database-type special)')
    parser.add_argument('--custom-fasta', dest='custom_fasta', help='fasta file for custom database (only applies to --database-type custom)')
    parser.add_argument('--custom-database-name', dest='custom_database_name', help='Name for custom database (only applies to --database-type custom)')
    parser.add_argument('--skip-maps', dest='skip_maps', action='store_true', help='Skip downloading the accession number to taxon maps (only applies to --database-type custom)')
    parser.add_argument('--clean', dest='clean', action='store_true', help='Clean up extra files')
    args = parser.parse_args()

    database_type = str(args.database_type)

    # Fail fast: each of these combinations would otherwise crash later,
    # in the worst case only after a lengthy download.
    if database_type == 'minikraken':
        if str(args.minikraken2_release) == 'April_2019' and args.minikraken2_version is None:
            parser.error('--minikraken2-release April_2019 requires --minikraken2-version')
    elif database_type == 'special':
        if args.special_database_type is None:
            parser.error('--database-type special requires --special-database-type')
    elif database_type == 'custom':
        missing = [
            flag for flag, value in (
                ('--custom-fasta', args.custom_fasta),
                ('--custom-database-name', args.custom_database_name),
            ) if not value
        ]
        if missing:
            parser.error('--database-type custom requires ' + ' and '.join(missing))

    with open(args.data_manager_json) as fh:
        data_manager_input = json.load(fh)

    target_directory = data_manager_input['output_data'][0]['extra_files_path']

    try:
        os.mkdir(target_directory)
    except OSError as exc:
        # An already existing directory is fine; anything else is fatal.
        if exc.errno == errno.EEXIST and os.path.isdir(target_directory):
            pass
        else:
            raise

    data_manager_output = {}

    # Build parameters shared by every kraken2-build based database type.
    common_args = {
        "kmer_len": args.kmer_len,
        "minimizer_len": args.minimizer_len,
        "minimizer_spaces": args.minimizer_spaces,
        "load_factor": args.load_factor,
        "threads": args.threads,
        "clean": args.clean,
    }

    if database_type == 'standard':
        data_manager_output = kraken2_build_standard(
            dict(common_args),
            target_directory,
        )
    elif database_type == 'minikraken':
        data_manager_output = kraken2_build_minikraken(
            str(args.minikraken2_version),
            str(args.minikraken2_release),
            target_directory
        )
    elif database_type == 'special':
        kraken2_args = dict(
            common_args,
            special_database_type=str(args.special_database_type),
        )
        data_manager_output = kraken2_build_special(
            kraken2_args,
            target_directory,
        )
    elif database_type == 'custom':
        kraken2_args = dict(
            common_args,
            custom_fasta=args.custom_fasta,
            skip_maps=args.skip_maps,
        )
        data_manager_output = kraken2_build_custom(
            kraken2_args,
            args.custom_database_name,
            target_directory,
        )
    else:
        sys.exit("Invalid database type")

    # The input JSON file is also where the resulting entry is written.
    with open(args.data_manager_json, 'w') as fh:
        json.dump(data_manager_output, fh, sort_keys=True)
379
380
381 if __name__ == "__main__":
382 main()