Mercurial > repos > iuc > data_manager_build_kraken2_database
comparison data_manager/kraken2_build_database.py @ 5:2f27f3b86827 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_build_kraken2_database/ commit 648fe4911ce49173697f314d70e63e0de95b7e66"
| author | iuc |
|---|---|
| date | Mon, 08 Nov 2021 15:40:34 +0000 |
| parents | 0eebe086fd58 |
| children | 9002633b4737 |
comparison
Legend: equal, deleted, inserted, replaced
| 4:0eebe086fd58 | 5:2f27f3b86827 |
|---|---|
| 14 from enum import Enum | 14 from enum import Enum |
| 15 | 15 |
| 16 try: | 16 try: |
| 17 # Python3 | 17 # Python3 |
| 18 from urllib.request import urlopen | 18 from urllib.request import urlopen |
| 19 from urllib.error import URLError | |
| 19 except ImportError: | 20 except ImportError: |
| 20 from urllib2 import urlopen | 21 from urllib2 import urlopen |
| 22 from urllib2 import URLError | |
| 21 | 23 |
| 22 | 24 |
| 23 DATA_TABLE_NAME = "kraken2_databases" | 25 DATA_TABLE_NAME = "kraken2_databases" |
| 24 | 26 |
| 25 | 27 |
| 26 class KrakenDatabaseTypes(Enum): | 28 class KrakenDatabaseTypes(Enum): |
| 27 standard = 'standard' | 29 standard_local_build = 'standard_local_build' |
| 30 standard_prebuilt = 'standard_prebuilt' | |
| 28 minikraken = 'minikraken' | 31 minikraken = 'minikraken' |
| 29 special = 'special' | 32 special = 'special' |
| 30 custom = 'custom' | 33 custom = 'custom' |
| 31 | 34 |
| 32 def __str__(self): | 35 def __str__(self): |
| 43 | 46 |
| 44 | 47 |
| 45 class Minikraken2Versions(Enum): | 48 class Minikraken2Versions(Enum): |
| 46 v1 = 'v1' | 49 v1 = 'v1' |
| 47 v2 = 'v2' | 50 v2 = 'v2' |
| 51 | |
| 52 def __str__(self): | |
| 53 return self.value | |
| 54 | |
| 55 | |
| 56 class StandardPrebuiltSizes(Enum): | |
| 57 full = 'full' | |
| 58 gb_16 = '16' | |
| 59 gb_8 = '8' | |
| 48 | 60 |
| 49 def __str__(self): | 61 def __str__(self): |
| 50 return self.value | 62 return self.value |
| 51 | 63 |
| 52 | 64 |
| 61 "minimizer-spaces", str(kraken2_args["minimizer_spaces"]), | 73 "minimizer-spaces", str(kraken2_args["minimizer_spaces"]), |
| 62 "load-factor", str(kraken2_args["load_factor"]), | 74 "load-factor", str(kraken2_args["load_factor"]), |
| 63 ]) | 75 ]) |
| 64 | 76 |
| 65 database_name = " ".join([ | 77 database_name = " ".join([ |
| 66 "Standard", | 78 "Standard (Local Build)", |
| 67 "(Created:", | 79 "(Created:", |
| 68 now + ",", | 80 now + ",", |
| 69 "kmer-len=" + str(kraken2_args["kmer_len"]) + ",", | 81 "kmer-len=" + str(kraken2_args["kmer_len"]) + ",", |
| 70 "minimizer-len=" + str(kraken2_args["minimizer_len"]) + ",", | 82 "minimizer-len=" + str(kraken2_args["minimizer_len"]) + ",", |
| 71 "minimizer-spaces=" + str(kraken2_args["minimizer_spaces"]) + ")", | 83 "minimizer-spaces=" + str(kraken2_args["minimizer_spaces"]) + ")", |
| 108 } | 120 } |
| 109 | 121 |
| 110 return data_table_entry | 122 return data_table_entry |
| 111 | 123 |
| 112 | 124 |
| 125 def kraken2_build_standard_prebuilt(standard_prebuilt_size, prebuilt_date, target_directory, data_table_name=DATA_TABLE_NAME): | |
| 126 | |
| 127 now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ") | |
| 128 | |
| 129 database_value = "_".join([ | |
| 130 now, | |
| 131 "standard_prebuilt", | |
| 132 standard_prebuilt_size | |
| 133 ]) | |
| 134 | |
| 135 database_name = " ".join([ | |
| 136 "Standard (Prebuilt)", | |
| 137 standard_prebuilt_size, | |
| 138 "(Downloaded:", | |
| 139 now + ")" | |
| 140 ]) | |
| 141 | |
| 142 database_path = database_value | |
| 143 | |
| 144 size_to_url_str = { | |
| 145 'full': '', | |
| 146 '16': '_16gb', | |
| 147 '8': '_8gb', | |
| 148 } | |
| 149 # we may need to let the user choose the date when new DBs are posted. | |
| 150 date_url_str = prebuilt_date.replace('-', '') | |
| 151 standard_prebuilt_size_url = size_to_url_str[standard_prebuilt_size] | |
| 152 # download the pre-built database | |
| 153 try: | |
| 154 download_url = 'https://genome-idx.s3.amazonaws.com/kraken/k2_standard%s_%s.tar.gz' % (standard_prebuilt_size_url, date_url_str) | |
| 155 src = urlopen(download_url) | |
| 156 except URLError as e: | |
| 157 print('url: ' + download_url, file=sys.stderr) | |
| 158 print(e, file=sys.stderr) | |
| 159 exit(1) | |
| 160 | |
| 161 with open('tmp_data.tar.gz', 'wb') as dst: | |
| 162 shutil.copyfileobj(src, dst) | |
| 163 # unpack the downloaded archive to the target directory | |
| 164 with tarfile.open('tmp_data.tar.gz', 'r:gz') as fh: | |
| 165 for member in fh.getmembers(): | |
| 166 if member.isreg(): | |
| 167 member.name = os.path.basename(member.name) | |
| 168 fh.extract(member, os.path.join(target_directory, database_path)) | |
| 169 | |
| 170 data_table_entry = { | |
| 171 'data_tables': { | |
| 172 data_table_name: [ | |
| 173 { | |
| 174 "value": database_value, | |
| 175 "name": database_name, | |
| 176 "path": database_path, | |
| 177 } | |
| 178 ] | |
| 179 } | |
| 180 } | |
| 181 | |
| 182 return data_table_entry | |
| 183 | |
| 184 | |
| 113 def kraken2_build_minikraken(minikraken2_version, target_directory, data_table_name=DATA_TABLE_NAME): | 185 def kraken2_build_minikraken(minikraken2_version, target_directory, data_table_name=DATA_TABLE_NAME): |
| 114 | 186 |
| 115 now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ") | 187 now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ") |
| 116 | 188 |
| 117 database_value = "_".join([ | 189 database_value = "_".join([ |
| 129 ]) | 201 ]) |
| 130 | 202 |
| 131 database_path = database_value | 203 database_path = database_value |
| 132 | 204 |
| 133 # download the minikraken2 data | 205 # download the minikraken2 data |
| 134 src = urlopen( | 206 try: |
| 135 'ftp://ftp.ccb.jhu.edu/pub/data/kraken2_dbs/minikraken2_%s_8GB_201904_UPDATE.tgz' | 207 download_url = 'https://genome-idx.s3.amazonaws.com/kraken/minikraken2_%s_8GB_201904.tgz' % minikraken2_version |
| 136 % minikraken2_version | 208 src = urlopen(download_url) |
| 137 ) | 209 except URLError as e: |
| 210 print('url: ' + download_url, file=sys.stderr) | |
| 211 print(e, file=sys.stderr) | |
| 212 exit(1) | |
| 213 | |
| 138 with open('tmp_data.tar.gz', 'wb') as dst: | 214 with open('tmp_data.tar.gz', 'wb') as dst: |
| 139 shutil.copyfileobj(src, dst) | 215 shutil.copyfileobj(src, dst) |
| 140 # unpack the downloaded archive to the target directory | 216 # unpack the downloaded archive to the target directory |
| 141 with tarfile.open('tmp_data.tar.gz', 'r:gz') as fh: | 217 with tarfile.open('tmp_data.tar.gz', 'r:gz') as fh: |
| 142 for member in fh.getmembers(): | 218 for member in fh.getmembers(): |
| 291 parser.add_argument('--minimizer-spaces', dest='minimizer_spaces', default=6, help='minimizer spaces') | 367 parser.add_argument('--minimizer-spaces', dest='minimizer_spaces', default=6, help='minimizer spaces') |
| 292 parser.add_argument('--load-factor', dest='load_factor', type=float, default=0.7, help='load factor') | 368 parser.add_argument('--load-factor', dest='load_factor', type=float, default=0.7, help='load factor') |
| 293 parser.add_argument('--threads', dest='threads', default=1, help='threads') | 369 parser.add_argument('--threads', dest='threads', default=1, help='threads') |
| 294 parser.add_argument('--database-type', dest='database_type', type=KrakenDatabaseTypes, choices=list(KrakenDatabaseTypes), required=True, help='type of kraken database to build') | 370 parser.add_argument('--database-type', dest='database_type', type=KrakenDatabaseTypes, choices=list(KrakenDatabaseTypes), required=True, help='type of kraken database to build') |
| 295 parser.add_argument('--minikraken2-version', dest='minikraken2_version', type=Minikraken2Versions, choices=list(Minikraken2Versions), help='MiniKraken2 version (only applies to --database-type minikraken)') | 371 parser.add_argument('--minikraken2-version', dest='minikraken2_version', type=Minikraken2Versions, choices=list(Minikraken2Versions), help='MiniKraken2 version (only applies to --database-type minikraken)') |
| 372 parser.add_argument('--standard-prebuilt-size', dest='standard_prebuilt_size', type=StandardPrebuiltSizes, choices=list(StandardPrebuiltSizes), help='Size of standard prebuilt database to download (only applies to --database-type standard_prebuilt. Options are: "8", "16", "full".)') | |
| 373 parser.add_argument('--prebuilt-date', dest='prebuilt_date', help='Database build date (YYYY-MM-DD). Only applies to --database-type standard_prebuilt.') | |
| 296 parser.add_argument('--special-database-type', dest='special_database_type', type=SpecialDatabaseTypes, choices=list(SpecialDatabaseTypes), help='type of special database to build (only applies to --database-type special)') | 374 parser.add_argument('--special-database-type', dest='special_database_type', type=SpecialDatabaseTypes, choices=list(SpecialDatabaseTypes), help='type of special database to build (only applies to --database-type special)') |
| 297 parser.add_argument('--custom-fasta', dest='custom_fasta', help='fasta file for custom database (only applies to --database-type custom)') | 375 parser.add_argument('--custom-fasta', dest='custom_fasta', help='fasta file for custom database (only applies to --database-type custom)') |
| 298 parser.add_argument('--custom-database-name', dest='custom_database_name', help='Name for custom database (only applies to --database-type custom)') | 376 parser.add_argument('--custom-database-name', dest='custom_database_name', help='Name for custom database (only applies to --database-type custom)') |
| 299 parser.add_argument('--skip-maps', dest='skip_maps', action='store_true', help='') | 377 parser.add_argument('--skip-maps', dest='skip_maps', action='store_true', help='') |
| 300 parser.add_argument('--clean', dest='clean', action='store_true', help='Clean up extra files') | 378 parser.add_argument('--clean', dest='clean', action='store_true', help='Clean up extra files') |
| 313 else: | 391 else: |
| 314 raise | 392 raise |
| 315 | 393 |
| 316 data_manager_output = {} | 394 data_manager_output = {} |
| 317 | 395 |
| 318 if str(args.database_type) == 'standard': | 396 if str(args.database_type) == 'standard_local_build': |
| 319 kraken2_args = { | 397 kraken2_args = { |
| 320 "kmer_len": args.kmer_len, | 398 "kmer_len": args.kmer_len, |
| 321 "minimizer_len": args.minimizer_len, | 399 "minimizer_len": args.minimizer_len, |
| 322 "minimizer_spaces": args.minimizer_spaces, | 400 "minimizer_spaces": args.minimizer_spaces, |
| 323 "load_factor": args.load_factor, | 401 "load_factor": args.load_factor, |
| 325 "clean": args.clean, | 403 "clean": args.clean, |
| 326 } | 404 } |
| 327 data_manager_output = kraken2_build_standard( | 405 data_manager_output = kraken2_build_standard( |
| 328 kraken2_args, | 406 kraken2_args, |
| 329 target_directory, | 407 target_directory, |
| 408 ) | |
| 409 elif str(args.database_type) == 'standard_prebuilt': | |
| 410 data_manager_output = kraken2_build_standard_prebuilt( | |
| 411 str(args.standard_prebuilt_size), | |
| 412 str(args.prebuilt_date), | |
| 413 target_directory | |
| 330 ) | 414 ) |
| 331 elif str(args.database_type) == 'minikraken': | 415 elif str(args.database_type) == 'minikraken': |
| 332 data_manager_output = kraken2_build_minikraken( | 416 data_manager_output = kraken2_build_minikraken( |
| 333 str(args.minikraken2_version), | 417 str(args.minikraken2_version), |
| 334 target_directory | 418 target_directory |
