Mercurial > repos > iuc > data_manager_build_kraken2_database
comparison data_manager/kraken2_build_database.py @ 5:2f27f3b86827 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_build_kraken2_database/ commit 648fe4911ce49173697f314d70e63e0de95b7e66"
author | iuc |
---|---|
date | Mon, 08 Nov 2021 15:40:34 +0000 |
parents | 0eebe086fd58 |
children | 9002633b4737 |
comparison
equal
deleted
inserted
replaced
4:0eebe086fd58 | 5:2f27f3b86827 |
---|---|
14 from enum import Enum | 14 from enum import Enum |
15 | 15 |
16 try: | 16 try: |
17 # Python3 | 17 # Python3 |
18 from urllib.request import urlopen | 18 from urllib.request import urlopen |
19 from urllib.error import URLError | |
19 except ImportError: | 20 except ImportError: |
20 from urllib2 import urlopen | 21 from urllib2 import urlopen |
22 from urllib2 import URLError | |
21 | 23 |
22 | 24 |
23 DATA_TABLE_NAME = "kraken2_databases" | 25 DATA_TABLE_NAME = "kraken2_databases" |
24 | 26 |
25 | 27 |
26 class KrakenDatabaseTypes(Enum): | 28 class KrakenDatabaseTypes(Enum): |
27 standard = 'standard' | 29 standard_local_build = 'standard_local_build' |
30 standard_prebuilt = 'standard_prebuilt' | |
28 minikraken = 'minikraken' | 31 minikraken = 'minikraken' |
29 special = 'special' | 32 special = 'special' |
30 custom = 'custom' | 33 custom = 'custom' |
31 | 34 |
32 def __str__(self): | 35 def __str__(self): |
43 | 46 |
44 | 47 |
45 class Minikraken2Versions(Enum): | 48 class Minikraken2Versions(Enum): |
46 v1 = 'v1' | 49 v1 = 'v1' |
47 v2 = 'v2' | 50 v2 = 'v2' |
51 | |
52 def __str__(self): | |
53 return self.value | |
54 | |
55 | |
56 class StandardPrebuiltSizes(Enum): | |
57 full = 'full' | |
58 gb_16 = '16' | |
59 gb_8 = '8' | |
48 | 60 |
49 def __str__(self): | 61 def __str__(self): |
50 return self.value | 62 return self.value |
51 | 63 |
52 | 64 |
61 "minimizer-spaces", str(kraken2_args["minimizer_spaces"]), | 73 "minimizer-spaces", str(kraken2_args["minimizer_spaces"]), |
62 "load-factor", str(kraken2_args["load_factor"]), | 74 "load-factor", str(kraken2_args["load_factor"]), |
63 ]) | 75 ]) |
64 | 76 |
65 database_name = " ".join([ | 77 database_name = " ".join([ |
66 "Standard", | 78 "Standard (Local Build)", |
67 "(Created:", | 79 "(Created:", |
68 now + ",", | 80 now + ",", |
69 "kmer-len=" + str(kraken2_args["kmer_len"]) + ",", | 81 "kmer-len=" + str(kraken2_args["kmer_len"]) + ",", |
70 "minimizer-len=" + str(kraken2_args["minimizer_len"]) + ",", | 82 "minimizer-len=" + str(kraken2_args["minimizer_len"]) + ",", |
71 "minimizer-spaces=" + str(kraken2_args["minimizer_spaces"]) + ")", | 83 "minimizer-spaces=" + str(kraken2_args["minimizer_spaces"]) + ")", |
108 } | 120 } |
109 | 121 |
110 return data_table_entry | 122 return data_table_entry |
111 | 123 |
112 | 124 |
125 def kraken2_build_standard_prebuilt(standard_prebuilt_size, prebuilt_date, target_directory, data_table_name=DATA_TABLE_NAME): | |
126 | |
127 now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ") | |
128 | |
129 database_value = "_".join([ | |
130 now, | |
131 "standard_prebuilt", | |
132 standard_prebuilt_size | |
133 ]) | |
134 | |
135 database_name = " ".join([ | |
136 "Standard (Prebuilt)", | |
137 standard_prebuilt_size, | |
138 "(Downloaded:", | |
139 now + ")" | |
140 ]) | |
141 | |
142 database_path = database_value | |
143 | |
144 size_to_url_str = { | |
145 'full': '', | |
146 '16': '_16gb', | |
147 '8': '_8gb', | |
148 } | |
149 # we may need to let the user choose the date when new DBs are posted. | |
150 date_url_str = prebuilt_date.replace('-', '') | |
151 standard_prebuilt_size_url = size_to_url_str[standard_prebuilt_size] | |
152 # download the pre-built database | |
153 try: | |
154 download_url = 'https://genome-idx.s3.amazonaws.com/kraken/k2_standard%s_%s.tar.gz' % (standard_prebuilt_size_url, date_url_str) | |
155 src = urlopen(download_url) | |
156 except URLError as e: | |
157 print('url: ' + download_url, file=sys.stderr) | |
158 print(e, file=sys.stderr) | |
159 exit(1) | |
160 | |
161 with open('tmp_data.tar.gz', 'wb') as dst: | |
162 shutil.copyfileobj(src, dst) | |
163 # unpack the downloaded archive to the target directory | |
164 with tarfile.open('tmp_data.tar.gz', 'r:gz') as fh: | |
165 for member in fh.getmembers(): | |
166 if member.isreg(): | |
167 member.name = os.path.basename(member.name) | |
168 fh.extract(member, os.path.join(target_directory, database_path)) | |
169 | |
170 data_table_entry = { | |
171 'data_tables': { | |
172 data_table_name: [ | |
173 { | |
174 "value": database_value, | |
175 "name": database_name, | |
176 "path": database_path, | |
177 } | |
178 ] | |
179 } | |
180 } | |
181 | |
182 return data_table_entry | |
183 | |
184 | |
113 def kraken2_build_minikraken(minikraken2_version, target_directory, data_table_name=DATA_TABLE_NAME): | 185 def kraken2_build_minikraken(minikraken2_version, target_directory, data_table_name=DATA_TABLE_NAME): |
114 | 186 |
115 now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ") | 187 now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ") |
116 | 188 |
117 database_value = "_".join([ | 189 database_value = "_".join([ |
129 ]) | 201 ]) |
130 | 202 |
131 database_path = database_value | 203 database_path = database_value |
132 | 204 |
133 # download the minikraken2 data | 205 # download the minikraken2 data |
134 src = urlopen( | 206 try: |
135 'ftp://ftp.ccb.jhu.edu/pub/data/kraken2_dbs/minikraken2_%s_8GB_201904_UPDATE.tgz' | 207 download_url = 'https://genome-idx.s3.amazonaws.com/kraken/minikraken2_%s_8GB_201904.tgz' % minikraken2_version |
136 % minikraken2_version | 208 src = urlopen(download_url) |
137 ) | 209 except URLError as e: |
210 print('url: ' + download_url, file=sys.stderr) | |
211 print(e, file=sys.stderr) | |
212 exit(1) | |
213 | |
138 with open('tmp_data.tar.gz', 'wb') as dst: | 214 with open('tmp_data.tar.gz', 'wb') as dst: |
139 shutil.copyfileobj(src, dst) | 215 shutil.copyfileobj(src, dst) |
140 # unpack the downloaded archive to the target directory | 216 # unpack the downloaded archive to the target directory |
141 with tarfile.open('tmp_data.tar.gz', 'r:gz') as fh: | 217 with tarfile.open('tmp_data.tar.gz', 'r:gz') as fh: |
142 for member in fh.getmembers(): | 218 for member in fh.getmembers(): |
291 parser.add_argument('--minimizer-spaces', dest='minimizer_spaces', default=6, help='minimizer spaces') | 367 parser.add_argument('--minimizer-spaces', dest='minimizer_spaces', default=6, help='minimizer spaces') |
292 parser.add_argument('--load-factor', dest='load_factor', type=float, default=0.7, help='load factor') | 368 parser.add_argument('--load-factor', dest='load_factor', type=float, default=0.7, help='load factor') |
293 parser.add_argument('--threads', dest='threads', default=1, help='threads') | 369 parser.add_argument('--threads', dest='threads', default=1, help='threads') |
294 parser.add_argument('--database-type', dest='database_type', type=KrakenDatabaseTypes, choices=list(KrakenDatabaseTypes), required=True, help='type of kraken database to build') | 370 parser.add_argument('--database-type', dest='database_type', type=KrakenDatabaseTypes, choices=list(KrakenDatabaseTypes), required=True, help='type of kraken database to build') |
295 parser.add_argument('--minikraken2-version', dest='minikraken2_version', type=Minikraken2Versions, choices=list(Minikraken2Versions), help='MiniKraken2 version (only applies to --database-type minikraken)') | 371 parser.add_argument('--minikraken2-version', dest='minikraken2_version', type=Minikraken2Versions, choices=list(Minikraken2Versions), help='MiniKraken2 version (only applies to --database-type minikraken)') |
372 parser.add_argument('--standard-prebuilt-size', dest='standard_prebuilt_size', type=StandardPrebuiltSizes, choices=list(StandardPrebuiltSizes), help='Size of standard prebuilt database to download (only applies to --database-type standard_prebuilt. Options are: "8", "16", "full".)') | |
373 parser.add_argument('--prebuilt-date', dest='prebuilt_date', help='Database build date (YYYY-MM-DD). Only applies to --database-type standard_prebuilt.') | |
296 parser.add_argument('--special-database-type', dest='special_database_type', type=SpecialDatabaseTypes, choices=list(SpecialDatabaseTypes), help='type of special database to build (only applies to --database-type special)') | 374 parser.add_argument('--special-database-type', dest='special_database_type', type=SpecialDatabaseTypes, choices=list(SpecialDatabaseTypes), help='type of special database to build (only applies to --database-type special)') |
297 parser.add_argument('--custom-fasta', dest='custom_fasta', help='fasta file for custom database (only applies to --database-type custom)') | 375 parser.add_argument('--custom-fasta', dest='custom_fasta', help='fasta file for custom database (only applies to --database-type custom)') |
298 parser.add_argument('--custom-database-name', dest='custom_database_name', help='Name for custom database (only applies to --database-type custom)') | 376 parser.add_argument('--custom-database-name', dest='custom_database_name', help='Name for custom database (only applies to --database-type custom)') |
299 parser.add_argument('--skip-maps', dest='skip_maps', action='store_true', help='') | 377 parser.add_argument('--skip-maps', dest='skip_maps', action='store_true', help='') |
300 parser.add_argument('--clean', dest='clean', action='store_true', help='Clean up extra files') | 378 parser.add_argument('--clean', dest='clean', action='store_true', help='Clean up extra files') |
313 else: | 391 else: |
314 raise | 392 raise |
315 | 393 |
316 data_manager_output = {} | 394 data_manager_output = {} |
317 | 395 |
318 if str(args.database_type) == 'standard': | 396 if str(args.database_type) == 'standard_local_build': |
319 kraken2_args = { | 397 kraken2_args = { |
320 "kmer_len": args.kmer_len, | 398 "kmer_len": args.kmer_len, |
321 "minimizer_len": args.minimizer_len, | 399 "minimizer_len": args.minimizer_len, |
322 "minimizer_spaces": args.minimizer_spaces, | 400 "minimizer_spaces": args.minimizer_spaces, |
323 "load_factor": args.load_factor, | 401 "load_factor": args.load_factor, |
325 "clean": args.clean, | 403 "clean": args.clean, |
326 } | 404 } |
327 data_manager_output = kraken2_build_standard( | 405 data_manager_output = kraken2_build_standard( |
328 kraken2_args, | 406 kraken2_args, |
329 target_directory, | 407 target_directory, |
408 ) | |
409 elif str(args.database_type) == 'standard_prebuilt': | |
410 data_manager_output = kraken2_build_standard_prebuilt( | |
411 str(args.standard_prebuilt_size), | |
412 str(args.prebuilt_date), | |
413 target_directory | |
330 ) | 414 ) |
331 elif str(args.database_type) == 'minikraken': | 415 elif str(args.database_type) == 'minikraken': |
332 data_manager_output = kraken2_build_minikraken( | 416 data_manager_output = kraken2_build_minikraken( |
333 str(args.minikraken2_version), | 417 str(args.minikraken2_version), |
334 target_directory | 418 target_directory |