comparison data_manager/kraken2_build_database.py @ 5:2f27f3b86827 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_build_kraken2_database/ commit 648fe4911ce49173697f314d70e63e0de95b7e66"
author iuc
date Mon, 08 Nov 2021 15:40:34 +0000
parents 0eebe086fd58
children 9002633b4737
comparison
equal deleted inserted replaced
4:0eebe086fd58 5:2f27f3b86827
14 from enum import Enum 14 from enum import Enum
15 15
16 try: 16 try:
17 # Python3 17 # Python3
18 from urllib.request import urlopen 18 from urllib.request import urlopen
19 from urllib.error import URLError
19 except ImportError: 20 except ImportError:
20 from urllib2 import urlopen 21 from urllib2 import urlopen
22 from urllib2 import URLError
21 23
22 24
23 DATA_TABLE_NAME = "kraken2_databases" 25 DATA_TABLE_NAME = "kraken2_databases"
24 26
25 27
26 class KrakenDatabaseTypes(Enum): 28 class KrakenDatabaseTypes(Enum):
27 standard = 'standard' 29 standard_local_build = 'standard_local_build'
30 standard_prebuilt = 'standard_prebuilt'
28 minikraken = 'minikraken' 31 minikraken = 'minikraken'
29 special = 'special' 32 special = 'special'
30 custom = 'custom' 33 custom = 'custom'
31 34
32 def __str__(self): 35 def __str__(self):
43 46
44 47
class Minikraken2Versions(Enum):
    """MiniKraken2 releases selectable through ``--minikraken2-version``.

    The member value is the version token that is substituted into the
    MiniKraken2 download URL.
    """

    v1 = 'v1'
    v2 = 'v2'

    def __str__(self):
        # Render as the bare value so that argparse choices and the
        # str(...) conversions done by callers show 'v1' / 'v2' instead of
        # 'Minikraken2Versions.v1'.
        return str(self.value)
54
55
class StandardPrebuiltSizes(Enum):
    """Sizes of the prebuilt standard database accepted by ``--standard-prebuilt-size``.

    The member values ('full', '16', '8') are the strings given on the
    command line; the prebuilt download code maps them to a URL suffix.
    """

    full = 'full'
    gb_16 = '16'
    gb_8 = '8'

    def __str__(self):
        # Render as the bare value so that argparse choices and the
        # str(...) conversions done by callers show e.g. '16' instead of
        # 'StandardPrebuiltSizes.gb_16'.
        return str(self.value)
51 63
52 64
61 "minimizer-spaces", str(kraken2_args["minimizer_spaces"]), 73 "minimizer-spaces", str(kraken2_args["minimizer_spaces"]),
62 "load-factor", str(kraken2_args["load_factor"]), 74 "load-factor", str(kraken2_args["load_factor"]),
63 ]) 75 ])
64 76
65 database_name = " ".join([ 77 database_name = " ".join([
66 "Standard", 78 "Standard (Local Build)",
67 "(Created:", 79 "(Created:",
68 now + ",", 80 now + ",",
69 "kmer-len=" + str(kraken2_args["kmer_len"]) + ",", 81 "kmer-len=" + str(kraken2_args["kmer_len"]) + ",",
70 "minimizer-len=" + str(kraken2_args["minimizer_len"]) + ",", 82 "minimizer-len=" + str(kraken2_args["minimizer_len"]) + ",",
71 "minimizer-spaces=" + str(kraken2_args["minimizer_spaces"]) + ")", 83 "minimizer-spaces=" + str(kraken2_args["minimizer_spaces"]) + ")",
108 } 120 }
109 121
110 return data_table_entry 122 return data_table_entry
111 123
112 124
def kraken2_build_standard_prebuilt(standard_prebuilt_size, prebuilt_date, target_directory, data_table_name=DATA_TABLE_NAME):
    """Download a prebuilt standard Kraken2 database and build its data table entry.

    The archive is fetched from the genome-idx S3 bucket, saved as
    ``tmp_data.tar.gz`` in the current working directory, and its regular
    files are extracted (flattened to their base names) into
    ``<target_directory>/<database_value>``.

    Args:
        standard_prebuilt_size: One of ``'full'``, ``'16'`` or ``'8'``.
        prebuilt_date: Build date of the database as ``YYYY-MM-DD``; the
            dashes are removed to form the date part of the download URL.
        target_directory: Directory under which the database directory is
            created.
        data_table_name: Name of the data table the entry is filed under.

    Returns:
        A dict of the form ``{'data_tables': {data_table_name: [entry]}}``
        where ``entry`` holds the ``value``, ``name`` and ``path`` of the
        downloaded database.

    Raises:
        KeyError: If ``standard_prebuilt_size`` is not a known size.
        SystemExit: With status 1 if the download URL cannot be opened.
    """
    now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")

    database_value = "_".join([
        now,
        "standard_prebuilt",
        standard_prebuilt_size
    ])

    database_name = " ".join([
        "Standard (Prebuilt)",
        standard_prebuilt_size,
        "(Downloaded:",
        now + ")"
    ])

    database_path = database_value

    size_to_url_str = {
        'full': '',
        '16': '_16gb',
        '8': '_8gb',
    }
    # we may need to let the user choose the date when new DBs are posted.
    date_url_str = prebuilt_date.replace('-', '')
    standard_prebuilt_size_url = size_to_url_str[standard_prebuilt_size]
    download_url = 'https://genome-idx.s3.amazonaws.com/kraken/k2_standard%s_%s.tar.gz' % (standard_prebuilt_size_url, date_url_str)

    # download the pre-built database; only the call that can fail is guarded
    try:
        src = urlopen(download_url)
    except URLError as e:
        print('url: ' + download_url, file=sys.stderr)
        print(e, file=sys.stderr)
        # sys.exit rather than the site-provided exit(), which is meant for
        # the interactive interpreter and is not guaranteed to exist.
        sys.exit(1)

    # Close the response explicitly: the Python 2 urllib2 response object is
    # not a context manager, so try/finally works for both import paths.
    try:
        with open('tmp_data.tar.gz', 'wb') as dst:
            shutil.copyfileobj(src, dst)
    finally:
        src.close()

    # unpack the downloaded archive to the target directory
    extract_directory = os.path.join(target_directory, database_path)
    with tarfile.open('tmp_data.tar.gz', 'r:gz') as fh:
        for member in fh.getmembers():
            if member.isreg():
                # Drop any directory components so every file lands directly
                # inside the database directory.
                member.name = os.path.basename(member.name)
                fh.extract(member, extract_directory)

    data_table_entry = {
        'data_tables': {
            data_table_name: [
                {
                    "value": database_value,
                    "name": database_name,
                    "path": database_path,
                }
            ]
        }
    }

    return data_table_entry
183
184
113 def kraken2_build_minikraken(minikraken2_version, target_directory, data_table_name=DATA_TABLE_NAME): 185 def kraken2_build_minikraken(minikraken2_version, target_directory, data_table_name=DATA_TABLE_NAME):
114 186
115 now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ") 187 now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")
116 188
117 database_value = "_".join([ 189 database_value = "_".join([
129 ]) 201 ])
130 202
131 database_path = database_value 203 database_path = database_value
132 204
133 # download the minikraken2 data 205 # download the minikraken2 data
134 src = urlopen( 206 try:
135 'ftp://ftp.ccb.jhu.edu/pub/data/kraken2_dbs/minikraken2_%s_8GB_201904_UPDATE.tgz' 207 download_url = 'https://genome-idx.s3.amazonaws.com/kraken/minikraken2_%s_8GB_201904.tgz' % minikraken2_version
136 % minikraken2_version 208 src = urlopen(download_url)
137 ) 209 except URLError as e:
210 print('url: ' + download_url, file=sys.stderr)
211 print(e, file=sys.stderr)
212 exit(1)
213
138 with open('tmp_data.tar.gz', 'wb') as dst: 214 with open('tmp_data.tar.gz', 'wb') as dst:
139 shutil.copyfileobj(src, dst) 215 shutil.copyfileobj(src, dst)
140 # unpack the downloaded archive to the target directory 216 # unpack the downloaded archive to the target directory
141 with tarfile.open('tmp_data.tar.gz', 'r:gz') as fh: 217 with tarfile.open('tmp_data.tar.gz', 'r:gz') as fh:
142 for member in fh.getmembers(): 218 for member in fh.getmembers():
291 parser.add_argument('--minimizer-spaces', dest='minimizer_spaces', default=6, help='minimizer spaces') 367 parser.add_argument('--minimizer-spaces', dest='minimizer_spaces', default=6, help='minimizer spaces')
292 parser.add_argument('--load-factor', dest='load_factor', type=float, default=0.7, help='load factor') 368 parser.add_argument('--load-factor', dest='load_factor', type=float, default=0.7, help='load factor')
293 parser.add_argument('--threads', dest='threads', default=1, help='threads') 369 parser.add_argument('--threads', dest='threads', default=1, help='threads')
294 parser.add_argument('--database-type', dest='database_type', type=KrakenDatabaseTypes, choices=list(KrakenDatabaseTypes), required=True, help='type of kraken database to build') 370 parser.add_argument('--database-type', dest='database_type', type=KrakenDatabaseTypes, choices=list(KrakenDatabaseTypes), required=True, help='type of kraken database to build')
295 parser.add_argument('--minikraken2-version', dest='minikraken2_version', type=Minikraken2Versions, choices=list(Minikraken2Versions), help='MiniKraken2 version (only applies to --database-type minikraken)') 371 parser.add_argument('--minikraken2-version', dest='minikraken2_version', type=Minikraken2Versions, choices=list(Minikraken2Versions), help='MiniKraken2 version (only applies to --database-type minikraken)')
372 parser.add_argument('--standard-prebuilt-size', dest='standard_prebuilt_size', type=StandardPrebuiltSizes, choices=list(StandardPrebuiltSizes), help='Size of standard prebuilt database to download (only applies to --database-type standard_prebuilt. Options are: "8", "16", "full".)')
373 parser.add_argument('--prebuilt-date', dest='prebuilt_date', help='Database build date (YYYY-MM-DD). Only applies to --database-type standard_prebuilt.')
296 parser.add_argument('--special-database-type', dest='special_database_type', type=SpecialDatabaseTypes, choices=list(SpecialDatabaseTypes), help='type of special database to build (only applies to --database-type special)') 374 parser.add_argument('--special-database-type', dest='special_database_type', type=SpecialDatabaseTypes, choices=list(SpecialDatabaseTypes), help='type of special database to build (only applies to --database-type special)')
297 parser.add_argument('--custom-fasta', dest='custom_fasta', help='fasta file for custom database (only applies to --database-type custom)') 375 parser.add_argument('--custom-fasta', dest='custom_fasta', help='fasta file for custom database (only applies to --database-type custom)')
298 parser.add_argument('--custom-database-name', dest='custom_database_name', help='Name for custom database (only applies to --database-type custom)') 376 parser.add_argument('--custom-database-name', dest='custom_database_name', help='Name for custom database (only applies to --database-type custom)')
299 parser.add_argument('--skip-maps', dest='skip_maps', action='store_true', help='') 377 parser.add_argument('--skip-maps', dest='skip_maps', action='store_true', help='')
300 parser.add_argument('--clean', dest='clean', action='store_true', help='Clean up extra files') 378 parser.add_argument('--clean', dest='clean', action='store_true', help='Clean up extra files')
313 else: 391 else:
314 raise 392 raise
315 393
316 data_manager_output = {} 394 data_manager_output = {}
317 395
318 if str(args.database_type) == 'standard': 396 if str(args.database_type) == 'standard_local_build':
319 kraken2_args = { 397 kraken2_args = {
320 "kmer_len": args.kmer_len, 398 "kmer_len": args.kmer_len,
321 "minimizer_len": args.minimizer_len, 399 "minimizer_len": args.minimizer_len,
322 "minimizer_spaces": args.minimizer_spaces, 400 "minimizer_spaces": args.minimizer_spaces,
323 "load_factor": args.load_factor, 401 "load_factor": args.load_factor,
325 "clean": args.clean, 403 "clean": args.clean,
326 } 404 }
327 data_manager_output = kraken2_build_standard( 405 data_manager_output = kraken2_build_standard(
328 kraken2_args, 406 kraken2_args,
329 target_directory, 407 target_directory,
408 )
409 elif str(args.database_type) == 'standard_prebuilt':
410 data_manager_output = kraken2_build_standard_prebuilt(
411 str(args.standard_prebuilt_size),
412 str(args.prebuilt_date),
413 target_directory
330 ) 414 )
331 elif str(args.database_type) == 'minikraken': 415 elif str(args.database_type) == 'minikraken':
332 data_manager_output = kraken2_build_minikraken( 416 data_manager_output = kraken2_build_minikraken(
333 str(args.minikraken2_version), 417 str(args.minikraken2_version),
334 target_directory 418 target_directory