comparison data_manager/fetch_mothur_reference_data.py @ 4:0db22932bc39 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_mothur_toolsuite/ commit f845716f6ac93500f143a30abef97eaba406344e"
| author | iuc |
|---|---|
| date | Fri, 25 Jun 2021 09:37:05 +0000 |
| parents | 9d09724f2bf1 |
| children | |
comparison
| 3:9d09724f2bf1 | 4:0db22932bc39 |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python3 |
| 2 # | 2 # |
| 3 # Data manager for reference data for the 'mothur_toolsuite' Galaxy tools | 3 # Data manager for reference data for the 'mothur_toolsuite' Galaxy tools |
| | 4 import io |
| 4 import json | 5 import json |
| 5 import optparse | 6 import optparse |
| 6 import os | 7 import os |
| 7 import shutil | 8 import shutil |
| 8 import sys | 9 import sys |
| 9 import tarfile | 10 import tarfile |
| 10 import tempfile | 11 import tempfile |
| 11 import urllib2 | 12 import urllib.error |
| | 13 import urllib.parse |
| | 14 import urllib.request |
| 12 import zipfile | 15 import zipfile |
| 13 from functools import reduce | 16 from functools import reduce |
| 14 | 17 |
| 15 # When extracting files from archives, skip names that | 18 # When extracting files from archives, skip names that |
| 16 # start with the following strings | 19 # start with the following strings |
| 36 "lookup_gs20": { | 39 "lookup_gs20": { |
| 37 "GS20": ["https://mothur.s3.us-east-2.amazonaws.com/wiki/lookup_gs20.zip", ] | 40 "GS20": ["https://mothur.s3.us-east-2.amazonaws.com/wiki/lookup_gs20.zip", ] |
| 38 }, | 41 }, |
| 39 # RDP reference files | 42 # RDP reference files |
| 40 # http://www.mothur.org/wiki/RDP_reference_files | 43 # http://www.mothur.org/wiki/RDP_reference_files |
| | 44 "RDP_v18": { |
| | 45 "16S rRNA RDP training set 18": |
| | 46 [ |
| | 47 "https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset18_062020.rdp.tgz", ], |
| | 48 "16S rRNA PDS training set 18": |
| | 49 [ |
| | 50 "https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset18_062020.pds.tgz", ], |
| | 51 }, |
| 41 "RDP_v16": { | 52 "RDP_v16": { |
| 42 "16S rRNA RDP training set 16": | 53 "16S rRNA RDP training set 16": |
| 43 ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset16_022016.rdp.tgz", ], | 54 ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset16_022016.rdp.tgz", ], |
| 44 "16S rRNA PDS training set 16": | 55 "16S rRNA PDS training set 16": |
| 45 ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset16_022016.pds.tgz", ], | 56 ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset16_022016.pds.tgz", ], |
| 74 "RDP training set 6": | 85 "RDP training set 6": |
| 75 ["https://mothur.s3.us-east-2.amazonaws.com/wiki/rdptrainingset.zip", ], | 86 ["https://mothur.s3.us-east-2.amazonaws.com/wiki/rdptrainingset.zip", ], |
| 76 }, | 87 }, |
| 77 # Silva reference files | 88 # Silva reference files |
| 78 # http://www.mothur.org/wiki/Silva_reference_files | 89 # http://www.mothur.org/wiki/Silva_reference_files |
| | 90 "silva_release_138.1": { |
| | 91 "SILVA release 138.1": |
| | 92 [ |
| | 93 "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.nr_v138_1.tgz", |
| | 94 "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.seed_v138_1.tgz", ], |
| | 95 }, |
| 79 "silva_release_128": { | 96 "silva_release_128": { |
| 80 "SILVA release 128": | 97 "SILVA release 128": |
| 81 ["https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.nr_v128.tgz", | 98 ["https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.nr_v128.tgz", |
| 82 "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.seed_v128.tgz", ], | 99 "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.seed_v128.tgz", ], |
| 83 }, | 100 }, |
| 228 system. | 245 system. |
| 229 | 246 |
| 230 Returns the name that the file is saved with. | 247 Returns the name that the file is saved with. |
| 231 | 248 |
| 232 """ | 249 """ |
| 233 print("Downloading %s" % url) | 250 print(f"Downloading {url}") |
| 234 if not target: | 251 if not target: |
| 235 target = os.path.basename(url) | 252 target = os.path.basename(url) |
| 236 if wd: | 253 if wd: |
| 237 target = os.path.join(wd, target) | 254 target = os.path.join(wd, target) |
| 238 print("Saving to %s" % target) | 255 print(f"Saving to {target}") |
| 239 with open(target, 'wb') as fh: | 256 with open(target, 'wb') as fh: |
| 240 fh.write(urllib2.urlopen(url).read()) | 257 url_h = urllib.request.urlopen(url) |
| | 258 while True: |
| | 259 buffer = url_h.read(io.DEFAULT_BUFFER_SIZE) |
| | 260 if buffer == b"": |
| | 261 break |
| | 262 fh.write(buffer) |
| 241 return target | 263 return target |
| 242 | 264 |
| 243 | 265 |
| 244 def unpack_zip_archive(filen, wd=None): | 266 def unpack_zip_archive(filen, wd=None): |
| 245 """Extract files from a ZIP archive | 267 """Extract files from a ZIP archive |
| 255 Once all the files are extracted the ZIP archive | 277 Once all the files are extracted the ZIP archive |
| 256 file is deleted from the file system. | 278 file is deleted from the file system. |
| 257 | 279 |
| 258 """ | 280 """ |
| 259 if not zipfile.is_zipfile(filen): | 281 if not zipfile.is_zipfile(filen): |
| 260 print("%s: not ZIP formatted file") | 282 print(f"{filen}: not ZIP formatted file") |
| 261 return [filen] | 283 return [filen] |
| 262 file_list = [] | 284 file_list = [] |
| 263 with zipfile.ZipFile(filen) as z: | 285 with zipfile.ZipFile(filen) as z: |
| 264 for name in z.namelist(): | 286 for name in z.namelist(): |
| 265 if reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False): | 287 if reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False): |
| 266 print("Ignoring %s" % name) | 288 print(f"Ignoring {name}") |
| 267 continue | 289 continue |
| 268 if wd: | 290 if wd: |
| 269 target = os.path.join(wd, name) | 291 target = os.path.join(wd, name) |
| 270 else: | 292 else: |
| 271 target = name | 293 target = name |
| 272 if name.endswith('/'): | 294 if name.endswith('/'): |
| 273 # Make directory | 295 # Make directory |
| 274 print("Creating dir %s" % target) | 296 print(f"Creating dir {target}") |
| 275 try: | 297 try: |
| 276 os.makedirs(target) | 298 os.makedirs(target) |
| 277 except OSError: | 299 except OSError: |
| 278 pass | 300 pass |
| 279 else: | 301 else: |
| 280 # Extract file | 302 # Extract file |
| 281 print("Extracting %s" % name) | 303 print(f"Extracting {target}") |
| 282 try: | 304 try: |
| 283 os.makedirs(os.path.dirname(target)) | 305 os.makedirs(os.path.dirname(target)) |
| 284 except OSError: | 306 except OSError: |
| 285 pass | 307 pass |
| 286 with open(target, 'wb') as fh: | 308 with open(target, 'wb') as fh: |
| 287 fh.write(z.read(name)) | 309 fh.write(z.read(name)) |
| 288 file_list.append(target) | 310 file_list.append(target) |
| 289 print("Removing %s" % filen) | 311 print(f"Removing {filen}") |
| 290 os.remove(filen) | 312 os.remove(filen) |
| 291 return file_list | 313 return file_list |
| 292 | 314 |
| 293 | 315 |
| 294 def unpack_tar_archive(filen, wd=None): | 316 def unpack_tar_archive(filen, wd=None): |
| 307 file is deleted from the file system. | 329 file is deleted from the file system. |
| 308 | 330 |
| 309 """ | 331 """ |
| 310 file_list = [] | 332 file_list = [] |
| 311 if not tarfile.is_tarfile(filen): | 333 if not tarfile.is_tarfile(filen): |
| 312 print("%s: not TAR file") | 334 print(f"{filen}: not TAR file") |
| 313 return [filen] | 335 return [filen] |
| 314 with tarfile.open(filen) as t: | 336 with tarfile.open(filen) as t: |
| 315 for name in t.getnames(): | 337 for name in t.getnames(): |
| 316 # Check for unwanted files | 338 # Check for unwanted files |
| 317 if reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False): | 339 if reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False): |
| 318 print("Ignoring %s" % name) | 340 print(f"Ignoring {name}") |
| 319 continue | 341 continue |
| 320 # Extract file | 342 # Extract file |
| 321 print("Extracting %s" % name) | 343 print(f"Extracting {name}") |
| 322 t.extract(name, wd) | 344 t.extract(name, wd) |
| 323 if wd: | 345 if wd: |
| 324 target = os.path.join(wd, name) | 346 target = os.path.join(wd, name) |
| 325 else: | 347 else: |
| 326 target = name | 348 target = name |
| 327 file_list.append(target) | 349 file_list.append(target) |
| 328 print("Removing %s" % filen) | 350 print(f"Removing {filen}") |
| 329 os.remove(filen) | 351 os.remove(filen) |
| 330 return file_list | 352 return file_list |
| 331 | 353 |
| 332 | 354 |
| 333 def unpack_archive(filen, wd=None): | 355 def unpack_archive(filen, wd=None): |
| 341 'wd' specifies the working directory to extract | 363 'wd' specifies the working directory to extract |
| 342 the files to, otherwise they are extracted to the | 364 the files to, otherwise they are extracted to the |
| 343 current working directory. | 365 current working directory. |
| 344 | 366 |
| 345 """ | 367 """ |
| 346 print("Unpack %s" % filen) | 368 print(f"Unpack {filen}") |
| 347 ext = os.path.splitext(filen)[1] | 369 ext = os.path.splitext(filen)[1] |
| 348 print("Extension: %s" % ext) | 370 print(f"Extension: {ext}") |
| 349 if ext == ".zip": | 371 if ext == ".zip": |
| 350 return unpack_zip_archive(filen, wd=wd) | 372 return unpack_zip_archive(filen, wd=wd) |
| 351 elif ext == ".tgz": | 373 elif ext == ".tgz": |
| 352 return unpack_tar_archive(filen, wd=wd) | 374 return unpack_tar_archive(filen, wd=wd) |
| 353 else: | 375 else: |
| 384 """ | 406 """ |
| 385 ext = os.path.splitext(filen)[1] | 407 ext = os.path.splitext(filen)[1] |
| 386 try: | 408 try: |
| 387 return MOTHUR_FILE_TYPES[ext] | 409 return MOTHUR_FILE_TYPES[ext] |
| 388 except KeyError: | 410 except KeyError: |
| 389 print("WARNING: unknown file type for " + filen + ", skipping") | 411 print(f"WARNING: unknown file type for {filen}, skipping") |
| 390 return None | 412 return None |
| 391 | 413 |
| 392 | 414 |
| 393 def get_name(filen): | 415 def get_name(filen): |
| 394 """Generate a descriptive name based on the file name | 416 """Generate a descriptive name based on the file name |
| 417 datasets: a list of dataset names corresponding to keys in | 439 datasets: a list of dataset names corresponding to keys in |
| 418 the MOTHUR_REFERENCE_DATA dictionary | 440 the MOTHUR_REFERENCE_DATA dictionary |
| 419 """ | 441 """ |
| 420 # Make working dir | 442 # Make working dir |
| 421 wd = tempfile.mkdtemp(suffix=".mothur", dir=os.getcwd()) | 443 wd = tempfile.mkdtemp(suffix=".mothur", dir=os.getcwd()) |
| 422 print("Working dir %s" % wd) | 444 print(f"Working dir {wd}") |
| 423 # Iterate over all requested reference data URLs | 445 # Iterate over all requested reference data URLs |
| 424 for dataset in datasets: | 446 for dataset in datasets: |
| 425 print("Handling dataset '%s'" % dataset) | 447 print(f"Handling dataset '{dataset}'") |
| 426 for name in MOTHUR_REFERENCE_DATA[dataset]: | 448 for name in MOTHUR_REFERENCE_DATA[dataset]: |
| 427 for f in fetch_files(MOTHUR_REFERENCE_DATA[dataset][name], wd=wd): | 449 for f in fetch_files(MOTHUR_REFERENCE_DATA[dataset][name], wd=wd): |
| 428 type_ = identify_type(f) | 450 type_ = identify_type(f) |
| 429 entry_name = "%s (%s)" % (os.path.splitext(os.path.basename(f))[0], name) | 451 name_from_file = os.path.splitext(os.path.basename(f))[0] |
| 430 print("%s\t\'%s'\t.../%s" % (type_, entry_name, os.path.basename(f))) | 452 entry_name = f"{name_from_file} ({name})" |
| | 453 print(f"{type_}\t\'{entry_name}'\t.../{os.path.basename(f)}") |
| 431 if type_ is not None: | 454 if type_ is not None: |
| 432 # Move to target dir | 455 # Move to target dir |
| 433 ref_data_file = os.path.basename(f) | 456 ref_data_file = os.path.basename(f) |
| 434 f1 = os.path.join(target_dir, ref_data_file) | 457 f1 = os.path.join(target_dir, ref_data_file) |
| 435 print("Moving %s to %s" % (f, f1)) | 458 print(f"Moving {f} to {f1}") |
| 436 os.rename(f, f1) | 459 shutil.move(f, f1) |
| 437 # Add entry to data table | 460 # Add entry to data table |
| 438 table_name = "mothur_%s" % type_ | 461 table_name = f"mothur_{type_}" |
| 439 add_data_table_entry(data_tables, table_name, dict(name=entry_name, value=ref_data_file)) | 462 add_data_table_entry(data_tables, table_name, dict(name=entry_name, value=ref_data_file)) |
| 440 # Remove working dir | 463 # Remove working dir |
| 441 print("Removing %s" % wd) | 464 print(f"Removing {wd}") |
| 442 shutil.rmtree(wd) | 465 shutil.rmtree(wd) |
| 443 | 466 |
| 444 | 467 |
| 445 def files_from_filesystem_paths(paths): | 468 def files_from_filesystem_paths(paths): |
| 446 """Return list of file paths from arbitrary input paths | 469 """Return list of file paths from arbitrary input paths |
| 452 """ | 475 """ |
| 453 # Collect files to add | 476 # Collect files to add |
| 454 files = [] | 477 files = [] |
| 455 for path in paths: | 478 for path in paths: |
| 456 path = os.path.abspath(path) | 479 path = os.path.abspath(path) |
| 457 print("Examining '%s'..." % path) | 480 print(f"Examining '{path}'...") |
| 458 if os.path.isfile(path): | 481 if os.path.isfile(path): |
| 459 # Store full path for file | 482 # Store full path for file |
| 460 files.append(path) | 483 files.append(path) |
| 461 elif os.path.isdir(path): | 484 elif os.path.isdir(path): |
| 462 # Descend into directory and collect the files | 485 # Descend into directory and collect the files |
| 491 files = files_from_filesystem_paths(paths) | 514 files = files_from_filesystem_paths(paths) |
| 492 # Handle each file individually | 515 # Handle each file individually |
| 493 for f in files: | 516 for f in files: |
| 494 type_ = identify_type(f) | 517 type_ = identify_type(f) |
| 495 if type_ is None: | 518 if type_ is None: |
| 496 print("%s: unrecognised type, skipped" % f) | 519 print(f"{f}: unrecognised type, skipped") |
| 497 continue | 520 continue |
| 498 ref_data_file = os.path.basename(f) | 521 ref_data_file = os.path.basename(f) |
| 499 target_file = os.path.join(target_dir, ref_data_file) | 522 target_file = os.path.join(target_dir, ref_data_file) |
| 500 entry_name = "%s" % os.path.splitext(ref_data_file)[0] | 523 entry_name = os.path.splitext(ref_data_file)[0] |
| 501 if description: | 524 if description: |
| 502 entry_name += " (%s)" % description | 525 entry_name += f" ({description})" |
| 503 print("%s\t\'%s'\t.../%s" % (type_, entry_name, ref_data_file)) | 526 print(f"{type_}\t\'{entry_name}'\t.../{ref_data_file}") |
| 504 # Link to or copy the data | 527 # Link to or copy the data |
| 505 if link_to_data: | 528 if link_to_data: |
| 506 os.symlink(f, target_file) | 529 os.symlink(f, target_file) |
| 507 else: | 530 else: |
| 508 shutil.copyfile(f, target_file) | 531 shutil.copyfile(f, target_file) |
| 509 # Add entry to data table | 532 # Add entry to data table |
| 510 table_name = "mothur_%s" % type_ | 533 table_name = f"mothur_{type_}" |
| 511 add_data_table_entry(data_tables, table_name, dict(name=entry_name, value=ref_data_file)) | 534 add_data_table_entry(data_tables, table_name, dict(name=entry_name, value=ref_data_file)) |
| 512 | 535 |
| 513 | 536 |
| 514 if __name__ == "__main__": | 537 if __name__ == "__main__": |
| 515 print("Starting...") | 538 print("Starting...") |
| 520 parser.add_option('--datasets', action='store', dest='datasets', default='') | 543 parser.add_option('--datasets', action='store', dest='datasets', default='') |
| 521 parser.add_option('--paths', action='store', dest='paths', default=[]) | 544 parser.add_option('--paths', action='store', dest='paths', default=[]) |
| 522 parser.add_option('--description', action='store', dest='description', default='') | 545 parser.add_option('--description', action='store', dest='description', default='') |
| 523 parser.add_option('--link', action='store_true', dest='link_to_data') | 546 parser.add_option('--link', action='store_true', dest='link_to_data') |
| 524 options, args = parser.parse_args() | 547 options, args = parser.parse_args() |
| 525 print("options: %s" % options) | 548 print(f"options: {options}") |
| 526 print("args : %s" % args) | 549 print(f"args : {args}") |
| 527 | 550 |
| 528 # Check for JSON file | 551 # Check for JSON file |
| 529 if len(args) != 1: | 552 if len(args) != 1: |
| 530 sys.stderr.write("Need to supply JSON file name") | 553 sys.stderr.write("Need to supply JSON file name") |
| 531 sys.exit(1) | 554 sys.exit(1) |
| 534 | 557 |
| 535 # Read the input JSON | 558 # Read the input JSON |
| 536 params, target_dir = read_input_json(jsonfile) | 559 params, target_dir = read_input_json(jsonfile) |
| 537 | 560 |
| 538 # Make the target directory | 561 # Make the target directory |
| 539 print("Making %s" % target_dir) | 562 print(f"Making {target_dir}") |
| 540 os.mkdir(target_dir) | 563 os.mkdir(target_dir) |
| 541 | 564 |
| 542 # Set up data tables dictionary | 565 # Set up data tables dictionary |
| 543 data_tables = create_data_tables_dict() | 566 data_tables = create_data_tables_dict() |
| 544 add_data_table(data_tables, 'mothur_lookup') | 567 add_data_table(data_tables, 'mothur_lookup') |
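
The most substantive functional change in this revision is replacing urllib2's whole-file read in the download helper with a chunked loop over `urllib.request`. Below is a minimal standalone sketch of that pattern, assuming Python 3; the `fetch_url` name and the URL in the `__main__` guard are hypothetical placeholders, not values from the data manager or the mothur wiki links above.

```python
#!/usr/bin/env python3
# Minimal sketch of the chunked-download pattern adopted in this
# revision. The URL below is a hypothetical placeholder, not one of
# the mothur reference-data links.
import io
import os
import urllib.request


def fetch_url(url, target=None):
    """Download 'url' to 'target' in fixed-size chunks."""
    if not target:
        target = os.path.basename(url)
    # Read io.DEFAULT_BUFFER_SIZE bytes per iteration so a large
    # reference archive is never held in memory in one piece, unlike
    # the old fh.write(urllib2.urlopen(url).read()) on the left side.
    with urllib.request.urlopen(url) as url_h, open(target, 'wb') as fh:
        while True:
            buffer = url_h.read(io.DEFAULT_BUFFER_SIZE)
            if not buffer:
                break
            fh.write(buffer)
    return target


if __name__ == "__main__":
    print(fetch_url("https://example.org/data/trainset.tgz"))
```

Reading a bounded buffer per iteration keeps memory use flat even for the larger SILVA archives, whereas the Python 2 version buffered each archive entirely in memory before writing it out.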
