Mercurial > repos > dfornika > data_manager_build_kraken2_database
comparison data_manager/kraken2_build_database.py @ 17:4c9f9d6098eb draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_build_kraken2_database/ commit db9eb9472b9f6caea6a0755697ee8d3a93e85b5b-dirty
author | dfornika |
---|---|
date | Mon, 06 May 2019 19:42:14 -0400 |
parents | |
children | f005b6efd096 |
comparison
equal
deleted
inserted
replaced
16:d9f190088154 | 17:4c9f9d6098eb |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 from __future__ import print_function | |
4 | |
5 import argparse | |
6 import datetime | |
7 import errno | |
8 import json | |
9 import os | |
10 import shutil | |
11 import subprocess | |
12 import sys | |
13 import tarfile | |
14 from enum import Enum | |
15 | |
16 try: | |
17 # Python3 | |
18 from urllib.request import urlopen | |
19 except ImportError: | |
20 from urllib2 import urlopen | |
21 | |
22 | |
# Default name of the data table that newly built databases are registered in;
# used as the default for every ``data_table_name`` parameter in this module.
DATA_TABLE_NAME = "kraken2_databases"
24 | |
25 | |
class KrakenDatabaseTypes(Enum):
    """Kinds of Kraken2 database that can be requested via ``--database-type``.

    Each member's value is the literal string accepted on the command line.
    """

    standard = 'standard'
    minikraken = 'minikraken'
    special = 'special'
    custom = 'custom'

    def __str__(self):
        # Render as the bare value (not ``ClassName.member``) so the member
        # can be compared against, and displayed as, a plain string.
        return str(self.value)
34 | |
35 | |
class SpecialDatabaseTypes(Enum):
    """Special databases selectable via ``--special-database-type``.

    Each member's value is the literal string accepted on the command line.
    """

    rdp = 'rdp'
    greengenes = 'greengenes'
    silva = 'silva'

    def __str__(self):
        # Render as the bare value rather than ``ClassName.member``.
        return str(self.value)
43 | |
44 | |
class Minikraken2Versions(Enum):
    """MiniKraken2 releases selectable via ``--minikraken2-version``.

    Each member's value is the literal string accepted on the command line.
    """

    v1 = 'v1'
    v2 = 'v2'

    def __str__(self):
        # Render as the bare value rather than ``ClassName.member``.
        return str(self.value)
51 | |
52 | |
def kraken2_build_standard(data_manager_dict, kraken2_args, target_directory, data_table_name=DATA_TABLE_NAME):
    """Build the standard Kraken2 database and register it in a data table.

    Runs ``kraken2-build --standard`` and then ``kraken2-build --clean`` with
    ``target_directory`` as the working directory, and appends an entry that
    describes the new database to ``data_manager_dict``.

    Args:
        data_manager_dict: dict that receives the data table entry (mutated).
        kraken2_args: mapping providing ``kmer_len``, ``minimizer_len``,
            ``minimizer_spaces`` and ``threads``.
        target_directory: working directory for ``kraken2-build``; the
            database directory is created relative to it.
        data_table_name: name of the data table the entry is appended to.

    Raises:
        subprocess.CalledProcessError: if either ``kraken2-build`` invocation
            exits with a non-zero status.
    """
    now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")

    threads = str(kraken2_args["threads"])
    kmer_len = str(kraken2_args["kmer_len"])
    minimizer_len = str(kraken2_args["minimizer_len"])
    minimizer_spaces = str(kraken2_args["minimizer_spaces"])

    database_value = "_".join([
        now,
        "standard",
        "kmer-len", kmer_len,
        "minimizer-len", minimizer_len,
        "minimizer-spaces", minimizer_spaces,
    ])

    database_name = " ".join([
        "Standard",
        "(Created:",
        now + ",",
        "kmer-len=" + kmer_len + ",",
        "minimizer-len=" + minimizer_len + ",",
        "minimizer-spaces=" + minimizer_spaces + ")",
    ])

    # The database directory is named after the unique value and is relative
    # to target_directory (the cwd of the kraken2-build calls below).
    database_path = database_value

    build_args = [
        '--threads', threads,
        '--standard',
        '--kmer-len', kmer_len,
        '--minimizer-len', minimizer_len,
        '--minimizer-spaces', minimizer_spaces,
        '--db', database_path,
    ]
    subprocess.check_call(['kraken2-build'] + build_args, cwd=target_directory)

    clean_args = [
        '--threads', threads,
        '--clean',
        '--db', database_path,
    ]
    subprocess.check_call(['kraken2-build'] + clean_args, cwd=target_directory)

    data_table_entry = {
        "value": database_value,
        "name": database_name,
        "path": database_path,
    }

    # Forward data_table_name so that a caller-supplied table name is honoured
    # (it was previously accepted but ignored).
    _add_data_table_entry(data_manager_dict, data_table_entry, data_table_name)
101 | |
102 | |
def kraken2_build_minikraken(data_manager_dict, minikraken2_version, target_directory, data_table_name=DATA_TABLE_NAME):
    """Download a pre-built MiniKraken2 database and register it.

    Downloads the 8GB MiniKraken2 archive for ``minikraken2_version``, unpacks
    its regular files into ``<target_directory>/<database value>`` and appends
    an entry describing the database to ``data_manager_dict``.

    Args:
        data_manager_dict: dict that receives the data table entry (mutated).
        minikraken2_version: version string interpolated into the download
            URL (``'v1'`` or ``'v2'`` when called from ``main``).
        target_directory: directory under which the database directory is
            created.
        data_table_name: name of the data table the entry is appended to.
    """
    now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")

    database_value = "_".join([
        now,
        "minikraken2",
        minikraken2_version,
        "8GB",
    ])

    database_name = " ".join([
        "Minikraken2",
        minikraken2_version,
        "(Created:",
        now + ")"
    ])

    database_path = database_value

    archive_path = 'tmp_data.tar.gz'
    try:
        # download the minikraken2 data
        src = urlopen(
            'ftp://ftp.ccb.jhu.edu/pub/data/kraken2_dbs/minikraken2_%s_8GB_201904_UPDATE.tgz'
            % minikraken2_version
        )
        try:
            with open(archive_path, 'wb') as dst:
                shutil.copyfileobj(src, dst)
        finally:
            # Explicit close: the urllib2 response object is not guaranteed
            # to be usable as a context manager on Python 2.
            src.close()

        # Unpack into the directory that is registered as "path" below, so
        # that the registered path actually exists under target_directory
        # regardless of the directory name used inside the archive.
        # NOTE(review): member names are flattened to their basename; this
        # assumes the archive holds no two files with the same basename.
        extract_directory = os.path.join(target_directory, database_path)
        with tarfile.open(archive_path, 'r:gz') as fh:
            for member in fh.getmembers():
                if member.isreg():
                    member.name = os.path.basename(member.name)
                    fh.extract(member, extract_directory)
    finally:
        # Do not leave the multi-gigabyte download behind in the cwd.
        if os.path.exists(archive_path):
            os.remove(archive_path)

    data_table_entry = {
        "value": database_value,
        "name": database_name,
        "path": database_path,
    }

    # Forward data_table_name so that a caller-supplied table name is honoured
    # (it was previously accepted but ignored).
    _add_data_table_entry(data_manager_dict, data_table_entry, data_table_name)
139 | |
140 | |
def kraken2_build_special(data_manager_dict, kraken2_args, target_directory, data_table_name=DATA_TABLE_NAME):
    """Build a "special" Kraken2 database and register it in a data table.

    Runs ``kraken2-build --special <type>`` and then ``kraken2-build --clean``
    with ``target_directory`` as the working directory, and appends an entry
    that describes the new database to ``data_manager_dict``.

    Args:
        data_manager_dict: dict that receives the data table entry (mutated).
        kraken2_args: mapping providing ``special_database_type`` (one of
            ``'rdp'``, ``'greengenes'``, ``'silva'``), ``kmer_len``,
            ``minimizer_len``, ``minimizer_spaces`` and ``threads``.
        target_directory: working directory for ``kraken2-build``; the
            database directory is created relative to it.
        data_table_name: name of the data table the entry is appended to.

    Raises:
        KeyError: if ``special_database_type`` has no display name below.
        subprocess.CalledProcessError: if either ``kraken2-build`` invocation
            exits with a non-zero status.
    """
    now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")

    # Human-readable labels used in the data table "name" column.
    special_database_names = {
        "rdp": "RDP",
        "greengenes": "Greengenes",
        "silva": "Silva",
    }

    special_type = kraken2_args["special_database_type"]
    threads = str(kraken2_args["threads"])
    kmer_len = str(kraken2_args["kmer_len"])
    minimizer_len = str(kraken2_args["minimizer_len"])
    minimizer_spaces = str(kraken2_args["minimizer_spaces"])

    database_value = "_".join([
        now,
        special_type,
        "kmer-len", kmer_len,
        "minimizer-len", minimizer_len,
        "minimizer-spaces", minimizer_spaces,
    ])

    database_name = " ".join([
        special_database_names[special_type],
        "(Created:",
        now + ",",
        "kmer-len=" + kmer_len + ",",
        "minimizer-len=" + minimizer_len + ",",
        "minimizer-spaces=" + minimizer_spaces + ")",
    ])

    # The database directory is named after the unique value and is relative
    # to target_directory (the cwd of the kraken2-build calls below).
    database_path = database_value

    build_args = [
        '--threads', threads,
        '--special', special_type,
        '--kmer-len', kmer_len,
        '--minimizer-len', minimizer_len,
        '--minimizer-spaces', minimizer_spaces,
        '--db', database_path,
    ]
    subprocess.check_call(['kraken2-build'] + build_args, cwd=target_directory)

    clean_args = [
        '--threads', threads,
        '--clean',
        '--db', database_path,
    ]
    subprocess.check_call(['kraken2-build'] + clean_args, cwd=target_directory)

    data_table_entry = {
        "value": database_value,
        "name": database_name,
        "path": database_path,
    }

    # Forward data_table_name so that a caller-supplied table name is honoured
    # (it was previously accepted but ignored).
    _add_data_table_entry(data_manager_dict, data_table_entry, data_table_name)
196 | |
197 | |
def kraken2_build_custom(data_manager_dict, kraken2_args, custom_database_name, target_directory, data_table_name=DATA_TABLE_NAME):
    """Build a custom Kraken2 database from a FASTA file and register it.

    Runs four ``kraken2-build`` steps with ``target_directory`` as the
    working directory (download taxonomy, add the FASTA to the library,
    build, clean) and appends an entry describing the new database to
    ``data_manager_dict``.

    Args:
        data_manager_dict: dict that receives the data table entry (mutated).
        kraken2_args: mapping providing ``custom_fasta``, ``kmer_len``,
            ``minimizer_len``, ``minimizer_spaces`` and ``threads``.
        custom_database_name: used as the ``--db`` directory name and as the
            value, name and path of the data table entry.
        target_directory: working directory for ``kraken2-build``; the
            database directory is created relative to it.
        data_table_name: name of the data table the entry is appended to.

    Raises:
        subprocess.CalledProcessError: if any ``kraken2-build`` invocation
            exits with a non-zero status.
    """
    threads = str(kraken2_args["threads"])

    steps = [
        [
            '--threads', threads,
            '--download-taxonomy',
            '--db', custom_database_name,
        ],
        [
            '--threads', threads,
            '--add-to-library', kraken2_args["custom_fasta"],
            '--db', custom_database_name,
        ],
        [
            '--threads', threads,
            '--build',
            '--kmer-len', str(kraken2_args["kmer_len"]),
            '--minimizer-len', str(kraken2_args["minimizer_len"]),
            '--minimizer-spaces', str(kraken2_args["minimizer_spaces"]),
            '--db', custom_database_name,
        ],
        [
            '--threads', threads,
            '--clean',
            '--db', custom_database_name,
        ],
    ]

    for step_args in steps:
        # cwd must be passed by keyword: the second positional parameter of
        # check_call/Popen is bufsize, not the working directory.
        subprocess.check_call(['kraken2-build'] + step_args, cwd=target_directory)

    data_table_entry = {
        "value": custom_database_name,
        "name": custom_database_name,
        "path": custom_database_name
    }

    # Argument order is (dict, entry, table name); passing the table name
    # second would use the entry dict as a dictionary key and raise TypeError.
    _add_data_table_entry(data_manager_dict, data_table_entry, data_table_name)
242 | |
243 | |
def _add_data_table_entry(data_manager_dict, data_table_entry, data_table_name=DATA_TABLE_NAME):
    """Append ``data_table_entry`` to a named table in ``data_manager_dict``.

    The ``'data_tables'`` mapping and the list for ``data_table_name`` are
    created on first use. The dict is modified in place and also returned.
    """
    tables = data_manager_dict.setdefault('data_tables', {})
    entries = tables.setdefault(data_table_name, [])
    entries.append(data_table_entry)
    return data_manager_dict
249 | |
250 | |
def main():
    """Command-line entry point: build the requested database, report it.

    Reads the JSON file given as the positional argument to find the target
    directory (``output_data[0].extra_files_path``), creates that directory
    if needed, dispatches to the builder matching ``--database-type`` and
    finally overwrites the same JSON file with the resulting data table
    entries.

    Options that a given database type depends on are validated up front so
    that a missing value is reported as a usage error instead of reaching
    the download URL or the ``kraken2-build`` argument list as ``None``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('data_manager_json')
    parser.add_argument('--kmer-len', dest='kmer_len', type=int, default=35, help='kmer length')
    parser.add_argument('--minimizer-len', dest='minimizer_len', type=int, default=31, help='minimizer length')
    # type=int for consistency with the other numeric options above.
    parser.add_argument('--minimizer-spaces', dest='minimizer_spaces', type=int, default=6, help='minimizer spaces')
    parser.add_argument('--threads', dest='threads', type=int, default=1, help='threads')
    parser.add_argument('--database-type', dest='database_type', type=KrakenDatabaseTypes, choices=list(KrakenDatabaseTypes), required=True, help='type of kraken database to build')
    parser.add_argument('--minikraken2-version', dest='minikraken2_version', type=Minikraken2Versions, choices=list(Minikraken2Versions), help='MiniKraken2 version (only applies to --database-type minikraken)')
    parser.add_argument('--special-database-type', dest='special_database_type', type=SpecialDatabaseTypes, choices=list(SpecialDatabaseTypes), help='type of special database to build (only applies to --database-type special)')
    parser.add_argument('--custom-fasta', dest='custom_fasta', help='fasta file for custom database (only applies to --database-type custom)')
    parser.add_argument('--custom-database-name', dest='custom_database_name', help='Name for custom database (only applies to --database-type custom)')
    args = parser.parse_args()

    database_type = str(args.database_type)

    # Fail early, before any directory is created or download is started.
    if database_type == 'minikraken' and args.minikraken2_version is None:
        parser.error('--minikraken2-version is required with --database-type minikraken')
    if database_type == 'special' and args.special_database_type is None:
        parser.error('--special-database-type is required with --database-type special')
    if database_type == 'custom':
        if not args.custom_fasta:
            parser.error('--custom-fasta is required with --database-type custom')
        if not args.custom_database_name:
            parser.error('--custom-database-name is required with --database-type custom')

    with open(args.data_manager_json) as json_in:
        data_manager_input = json.loads(json_in.read())

    target_directory = data_manager_input['output_data'][0]['extra_files_path']

    try:
        os.mkdir(target_directory)
    except OSError as exc:
        # An already existing directory is fine; anything else is an error.
        if not (exc.errno == errno.EEXIST and os.path.isdir(target_directory)):
            raise

    data_manager_output = {}

    # Build parameters shared by every kraken2-build based database type.
    common_args = {
        "kmer_len": args.kmer_len,
        "minimizer_len": args.minimizer_len,
        "minimizer_spaces": args.minimizer_spaces,
        "threads": args.threads,
    }

    print(args.database_type)
    if database_type == 'standard':
        kraken2_build_standard(
            data_manager_output,
            dict(common_args),
            target_directory,
        )
    elif database_type == 'minikraken':
        kraken2_build_minikraken(
            data_manager_output,
            str(args.minikraken2_version),
            target_directory
        )
    elif database_type == 'special':
        kraken2_build_special(
            data_manager_output,
            dict(common_args, special_database_type=str(args.special_database_type)),
            target_directory,
        )
    elif database_type == 'custom':
        kraken2_build_custom(
            data_manager_output,
            dict(common_args, custom_fasta=args.custom_fasta),
            args.custom_database_name,
            target_directory,
        )
    else:
        sys.exit("Invalid database type")

    # The input JSON file is overwritten with the resulting data table entries.
    with open(args.data_manager_json, 'w') as json_out:
        json_out.write(json.dumps(data_manager_output))
329 | |
330 | |
# Run the command-line entry point only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    main()