Mercurial > repos > dave > data_manager_build_kraken2_database
comparison data_manager/kraken2_build_database.py @ 0:bd47b9f87d67 draft default tip
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_build_kraken2_database/ commit 68cd9a8ae50c5dfe6b667062a5172010511bcaff-dirty"
| author | dave |
|---|---|
| date | Tue, 01 Dec 2020 16:07:40 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:bd47b9f87d67 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 from __future__ import print_function | |
| 4 | |
| 5 import argparse | |
| 6 import datetime | |
| 7 import errno | |
| 8 import json | |
| 9 import os | |
| 10 import shutil | |
| 11 import subprocess | |
| 12 import sys | |
| 13 import tarfile | |
| 14 from enum import Enum | |
| 15 | |
| 16 try: | |
| 17 # Python3 | |
| 18 from urllib.request import urlopen | |
| 19 except ImportError: | |
| 20 from urllib2 import urlopen | |
| 21 | |
| 22 | |
| 23 DATA_TABLE_NAME = "kraken2_databases" | |
| 24 | |
| 25 | |
class KrakenDatabaseTypes(Enum):
    """Supported kinds of Kraken2 database builds selectable via --database-type."""

    standard = "standard"
    minikraken = "minikraken"
    special = "special"
    custom = "custom"

    def __str__(self):
        # argparse renders choices and parsed values via str(); expose the
        # raw value so CLI text round-trips to the enum member cleanly.
        return self.value
| 34 | |
| 35 | |
class SpecialDatabaseTypes(Enum):
    """16S databases kraken2-build can fetch with --special."""

    rdp = "rdp"
    greengenes = "greengenes"
    silva = "silva"

    def __str__(self):
        # Keep str() equal to the wire value expected by kraken2-build.
        return self.value
| 43 | |
| 44 | |
class Minikraken2Versions(Enum):
    """MiniKraken2 archive versions (only meaningful for the April 2019 release)."""

    v1 = "v1"
    v2 = "v2"

    def __str__(self):
        # str() yields the value used to format the download URL.
        return self.value
| 51 | |
class Minikraken2Releases(Enum):
    """Published MiniKraken2 releases available for download."""

    March_2020 = "March_2020"
    April_2019 = "April_2019"

    def __str__(self):
        # str() is compared against 'April_2019' when picking the URL.
        return self.value
| 58 | |
| 59 | |
def kraken2_build_standard(kraken2_args, target_directory, data_table_name=DATA_TABLE_NAME):
    """Build the standard Kraken2 database and return a data table entry.

    :param kraken2_args: dict with keys ``kmer_len``, ``minimizer_len``,
        ``minimizer_spaces``, ``load_factor``, ``threads`` and ``clean``;
        all but ``clean`` are stringified onto the kraken2-build command line.
    :param target_directory: directory in which ``kraken2-build`` runs; the
        database is created in a timestamped subdirectory of it.
    :param data_table_name: name of the data table to populate.
    :return: dict in the Galaxy data-manager ``data_tables`` output shape.
    :raises subprocess.CalledProcessError: if ``kraken2-build`` exits non-zero.
    """
    now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")

    # The value doubles as the on-disk directory name of the database.
    database_value = "_".join([
        now,
        "standard",
        "kmer-len", str(kraken2_args["kmer_len"]),
        "minimizer-len", str(kraken2_args["minimizer_len"]),
        "minimizer-spaces", str(kraken2_args["minimizer_spaces"]),
        "load-factor", str(kraken2_args["load_factor"]),
    ])

    # Human-readable display name.  Fixed: load-factor is now a proper
    # "key=value" entry inside the parenthesised list instead of dangling
    # after the closing parenthesis.
    database_name = " ".join([
        "Standard",
        "(Created:",
        now + ",",
        "kmer-len=" + str(kraken2_args["kmer_len"]) + ",",
        "minimizer-len=" + str(kraken2_args["minimizer_len"]) + ",",
        "minimizer-spaces=" + str(kraken2_args["minimizer_spaces"]) + ",",
        "load-factor=" + str(kraken2_args["load_factor"]) + ")",
    ])

    database_path = database_value

    args = [
        '--threads', str(kraken2_args["threads"]),
        '--standard',
        '--kmer-len', str(kraken2_args["kmer_len"]),
        '--minimizer-len', str(kraken2_args["minimizer_len"]),
        '--minimizer-spaces', str(kraken2_args["minimizer_spaces"]),
        '--load-factor', str(kraken2_args["load_factor"]),
        '--db', database_path
    ]

    subprocess.check_call(['kraken2-build'] + args, cwd=target_directory)

    if kraken2_args["clean"]:
        # Optionally remove intermediate build files to save disk space.
        args = [
            '--threads', str(kraken2_args["threads"]),
            '--clean',
            '--db', database_path
        ]

        subprocess.check_call(['kraken2-build'] + args, cwd=target_directory)

    data_table_entry = {
        'data_tables': {
            data_table_name: [
                {
                    "value": database_value,
                    "name": database_name,
                    "path": database_path,
                }
            ]
        }
    }

    return data_table_entry
| 118 | |
| 119 | |
def kraken2_build_minikraken(minikraken2_version, minikraken2_release, target_directory, data_table_name=DATA_TABLE_NAME):
    """Download a prebuilt MiniKraken2 database and return a data table entry.

    :param minikraken2_version: 'v1' or 'v2'; only used for the April 2019
        release (stringified 'None' otherwise, which is ignored).
    :param minikraken2_release: 'April_2019' or any other value (treated as
        the March 2020 release).
    :param target_directory: directory under which the database directory is
        created.
    :param data_table_name: name of the data table to populate.
    :return: dict in the Galaxy data-manager ``data_tables`` output shape.
    """
    now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")

    value_parts = [now, "minikraken2", minikraken2_release, "8GB"]
    name_parts = ["Minikraken2", minikraken2_release, "8GB", "(Created: %s)" % now]

    if minikraken2_release == 'April_2019':
        # Only the April 2019 release is split into v1/v2 archives.
        value_parts.insert(3, minikraken2_version)
        name_parts.insert(2, minikraken2_version)
        src = urlopen(
            'ftp://ftp.ccb.jhu.edu/pub/data/kraken2_dbs/old/minikraken2_%s_8GB_201904.tgz'
            % minikraken2_version
        )
    else:
        src = urlopen('ftp://ftp.ccb.jhu.edu/pub/data/kraken2_dbs/minikraken_8GB_202003.tgz')

    database_value = "_".join(value_parts)

    database_name = " ".join(name_parts)

    database_path = database_value

    # Download the minikraken2 data; close the URL handle even on failure
    # (the original leaked the handle and left the archive behind).
    tmp_archive = 'tmp_data.tar.gz'
    try:
        with open(tmp_archive, 'wb') as dst:
            shutil.copyfileobj(src, dst)
    finally:
        src.close()
    try:
        # Unpack the downloaded archive to the target directory, flattening
        # member paths so the database files land directly in database_path.
        with tarfile.open(tmp_archive, 'r:gz') as fh:
            for member in fh.getmembers():
                if member.isreg():
                    member.name = os.path.basename(member.name)
                    fh.extract(member, os.path.join(target_directory, database_path))
    finally:
        # The archive is no longer needed once extracted; don't leave multi-GB
        # temp files in the job working directory.
        os.remove(tmp_archive)

    data_table_entry = {
        'data_tables': {
            data_table_name: [
                {
                    "value": database_value,
                    "name": database_name,
                    "path": database_path,
                }
            ]
        }
    }

    return data_table_entry
| 166 | |
| 167 | |
def kraken2_build_special(kraken2_args, target_directory, data_table_name=DATA_TABLE_NAME):
    """Build one of the special 16S Kraken2 databases (RDP/Greengenes/Silva).

    :param kraken2_args: dict with keys ``special_database_type``,
        ``kmer_len``, ``minimizer_len``, ``minimizer_spaces``,
        ``load_factor``, ``threads`` and ``clean``.
    :param target_directory: directory in which ``kraken2-build`` runs.
    :param data_table_name: name of the data table to populate.
    :return: dict in the Galaxy data-manager ``data_tables`` output shape.
    :raises subprocess.CalledProcessError: if ``kraken2-build`` exits non-zero.
    """
    now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")

    # Map CLI value -> display name for the data table entry.
    special_database_names = {
        "rdp": "RDP",
        "greengenes": "Greengenes",
        "silva": "Silva",
    }

    # The value doubles as the on-disk directory name of the database.
    database_value = "_".join([
        now,
        kraken2_args["special_database_type"],
        "kmer-len", str(kraken2_args["kmer_len"]),
        "minimizer-len", str(kraken2_args["minimizer_len"]),
        "minimizer-spaces", str(kraken2_args["minimizer_spaces"]),
        "load-factor", str(kraken2_args["load_factor"]),
    ])

    # Human-readable display name.  Fixed: only the final load-factor entry
    # closes the parenthesis (the original emitted two closing parens).
    database_name = " ".join([
        special_database_names[kraken2_args["special_database_type"]],
        "(Created:",
        now + ",",
        "kmer-len=" + str(kraken2_args["kmer_len"]) + ",",
        "minimizer-len=" + str(kraken2_args["minimizer_len"]) + ",",
        "minimizer-spaces=" + str(kraken2_args["minimizer_spaces"]) + ",",
        "load-factor=" + str(kraken2_args["load_factor"]) + ")",
    ])

    database_path = database_value

    args = [
        '--threads', str(kraken2_args["threads"]),
        '--special', kraken2_args["special_database_type"],
        '--kmer-len', str(kraken2_args["kmer_len"]),
        '--minimizer-len', str(kraken2_args["minimizer_len"]),
        '--minimizer-spaces', str(kraken2_args["minimizer_spaces"]),
        '--load-factor', str(kraken2_args["load_factor"]),
        '--db', database_path
    ]

    subprocess.check_call(['kraken2-build'] + args, cwd=target_directory)

    if kraken2_args["clean"]:
        # Optionally remove intermediate build files to save disk space.
        args = [
            '--threads', str(kraken2_args["threads"]),
            '--clean',
            '--db', database_path
        ]

        subprocess.check_call(['kraken2-build'] + args, cwd=target_directory)

    data_table_entry = {
        'data_tables': {
            data_table_name: [
                {
                    "value": database_value,
                    "name": database_name,
                    "path": database_path,
                }
            ]
        }
    }

    return data_table_entry
| 233 | |
| 234 | |
def kraken2_build_custom(kraken2_args, custom_database_name, target_directory, data_table_name=DATA_TABLE_NAME):
    """Build a custom Kraken2 database from a user-supplied FASTA file.

    Runs the kraken2-build pipeline in order: download-taxonomy,
    add-to-library, build, and (optionally) clean.

    :param kraken2_args: dict with keys ``custom_fasta``, ``skip_maps``,
        ``kmer_len``, ``minimizer_len``, ``minimizer_spaces``,
        ``load_factor``, ``threads`` and ``clean``.
    :param custom_database_name: user-chosen name; used as value, display
        name and on-disk path of the database.
    :param target_directory: directory in which ``kraken2-build`` runs.
    :param data_table_name: name of the data table to populate.
    :return: dict in the Galaxy data-manager ``data_tables`` output shape.
    :raises subprocess.CalledProcessError: if any step exits non-zero.
    """
    threads_opt = ['--threads', str(kraken2_args["threads"])]

    # Step 1: fetch the NCBI taxonomy (optionally skipping accession maps).
    taxonomy_step = threads_opt + [
        '--download-taxonomy',
        '--db', custom_database_name,
    ]
    if kraken2_args['skip_maps']:
        taxonomy_step.append('--skip-maps')

    # Step 2: register the user FASTA in the database's library.
    library_step = threads_opt + [
        '--add-to-library', kraken2_args["custom_fasta"],
        '--db', custom_database_name
    ]

    # Step 3: build the database with the requested k-mer parameters.
    build_step = threads_opt + [
        '--build',
        '--kmer-len', str(kraken2_args["kmer_len"]),
        '--minimizer-len', str(kraken2_args["minimizer_len"]),
        '--minimizer-spaces', str(kraken2_args["minimizer_spaces"]),
        '--load-factor', str(kraken2_args["load_factor"]),
        '--db', custom_database_name
    ]

    steps = [taxonomy_step, library_step, build_step]

    # Step 4 (optional): drop intermediate files to save disk space.
    if kraken2_args["clean"]:
        steps.append(threads_opt + ['--clean', '--db', custom_database_name])

    for step in steps:
        subprocess.check_call(['kraken2-build'] + step, cwd=target_directory)

    return {
        'data_tables': {
            data_table_name: [
                {
                    "value": custom_database_name,
                    "name": custom_database_name,
                    "path": custom_database_name
                }
            ]
        }
    }
| 290 | |
| 291 | |
def main():
    """Parse CLI options, build the requested Kraken2 database, and write the
    resulting data table JSON back over the data manager JSON file."""
    parser = argparse.ArgumentParser()
    parser.add_argument('data_manager_json')
    parser.add_argument('--kmer-len', dest='kmer_len', type=int, default=35, help='kmer length')
    parser.add_argument('--minimizer-len', dest='minimizer_len', type=int, default=31, help='minimizer length')
    # type=int added for consistency with --kmer-len/--minimizer-len; the
    # values were previously passed through as strings.
    parser.add_argument('--minimizer-spaces', dest='minimizer_spaces', type=int, default=6, help='minimizer spaces')
    parser.add_argument('--load-factor', dest='load_factor', type=float, default=0.7, help='load factor')
    parser.add_argument('--threads', dest='threads', type=int, default=1, help='threads')
    parser.add_argument('--database-type', dest='database_type', type=KrakenDatabaseTypes, choices=list(KrakenDatabaseTypes), required=True, help='type of kraken database to build')
    parser.add_argument('--minikraken2-version', dest='minikraken2_version', type=Minikraken2Versions, choices=list(Minikraken2Versions), help='MiniKraken2 version (only applies to --database-type minikraken) and the Mar2019 release')
    parser.add_argument('--minikraken2-release', dest='minikraken2_release', type=Minikraken2Releases, choices=list(Minikraken2Releases), help='MiniKraken2 release (only applies to --database-type minikraken)')
    parser.add_argument('--special-database-type', dest='special_database_type', type=SpecialDatabaseTypes, choices=list(SpecialDatabaseTypes), help='type of special database to build (only applies to --database-type special)')
    parser.add_argument('--custom-fasta', dest='custom_fasta', help='fasta file for custom database (only applies to --database-type custom)')
    parser.add_argument('--custom-database-name', dest='custom_database_name', help='Name for custom database (only applies to --database-type custom)')
    parser.add_argument('--skip-maps', dest='skip_maps', action='store_true', help='')
    parser.add_argument('--clean', dest='clean', action='store_true', help='Clean up extra files')
    args = parser.parse_args()

    with open(args.data_manager_json) as fh:
        data_manager_input = json.load(fh)

    # Galaxy supplies the output directory via the data manager JSON.
    target_directory = data_manager_input['output_data'][0]['extra_files_path']

    # errno-based EEXIST check instead of makedirs(exist_ok=True) keeps
    # Python 2 compatibility (the file imports from __future__).
    try:
        os.mkdir(target_directory)
    except OSError as exc:
        if exc.errno == errno.EEXIST and os.path.isdir(target_directory):
            pass
        else:
            raise

    data_manager_output = {}

    if str(args.database_type) == 'standard':
        kraken2_args = {
            "kmer_len": args.kmer_len,
            "minimizer_len": args.minimizer_len,
            "minimizer_spaces": args.minimizer_spaces,
            "load_factor": args.load_factor,
            "threads": args.threads,
            "clean": args.clean,
        }
        data_manager_output = kraken2_build_standard(
            kraken2_args,
            target_directory,
        )
    elif str(args.database_type) == 'minikraken':
        data_manager_output = kraken2_build_minikraken(
            str(args.minikraken2_version),
            str(args.minikraken2_release),
            target_directory
        )
    elif str(args.database_type) == 'special':
        kraken2_args = {
            "special_database_type": str(args.special_database_type),
            "kmer_len": args.kmer_len,
            "minimizer_len": args.minimizer_len,
            "minimizer_spaces": args.minimizer_spaces,
            "load_factor": args.load_factor,
            "threads": args.threads,
            "clean": args.clean,
        }
        data_manager_output = kraken2_build_special(
            kraken2_args,
            target_directory,
        )
    elif str(args.database_type) == 'custom':
        kraken2_args = {
            "custom_fasta": args.custom_fasta,
            "skip_maps": args.skip_maps,
            "kmer_len": args.kmer_len,
            "minimizer_len": args.minimizer_len,
            "minimizer_spaces": args.minimizer_spaces,
            "load_factor": args.load_factor,
            "threads": args.threads,
            "clean": args.clean,
        }
        data_manager_output = kraken2_build_custom(
            kraken2_args,
            args.custom_database_name,
            target_directory,
        )
    else:
        sys.exit("Invalid database type")

    # Overwrite the input JSON with the data table entries Galaxy consumes.
    with open(args.data_manager_json, 'w') as fh:
        json.dump(data_manager_output, fh, sort_keys=True)


if __name__ == "__main__":
    main()
