Mercurial > repos > bgruening > data_manager_semibin
comparison data_manager/data_manager_semibin.py @ 0:1e4dd26db773 draft default tip
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_semibin commit aa9bfb2fb62547ee8bac34f0de5b3beaa0bfd1a4"
| author | bgruening |
|---|---|
| date | Fri, 14 Oct 2022 21:29:47 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1e4dd26db773 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 # | |
| 3 # Data manager for reference data for the SemiBin Galaxy tools | |
| 4 import argparse | |
| 5 import json | |
| 6 import subprocess | |
| 7 from datetime import date | |
| 8 from pathlib import Path | |
| 9 | |
| 10 | |
| 11 # Utility functions for interacting with Galaxy JSON | |
def read_input_json(json_fp):
    """Load the JSON file handed over by the data manager tool.

    Arguments:
      json_fp: path of the JSON file to read

    Returns a tuple (param_dict, extra_files_path):

    - 'param_dict' is the dictionary stored under the 'param_dict'
      key, i.e. the parameters that were input into the tool;
    - 'extra_files_path' is a pathlib.Path built from the
      'extra_files_path' value of the first 'output_data' item. It
      is the directory where output files must be placed for the
      receiving data manager to pick them up.

    NB this function does not create the 'extra_files_path'
    directory; it doesn't exist initially and the calling script
    has to make it when needed.

    """
    with open(json_fp) as handle:
        payload = json.load(handle)
    param_dict = payload['param_dict']
    extra_files_path = Path(payload['output_data'][0]['extra_files_path'])
    return param_dict, extra_files_path
| 31 | |
| 32 | |
| 33 # Utility functions for creating data table dictionaries | |
| 34 # | |
| 35 # Example usage: | |
| 36 # >>> d = create_data_tables_dict() | |
| 37 # >>> add_data_table(d,'my_data') | |
| 38 # >>> add_data_table_entry(d,'my_data',dict(dbkey='hg19',value='human')) | |
| 39 # >>> add_data_table_entry(d,'my_data',dict(dbkey='mm9',value='mouse')) | |
| 40 # >>> print(json.dumps(d)) | |
def create_data_tables_dict():
    """Build an empty container for data table information

    The returned dictionary has a single 'data_tables' key mapped
    to an empty dictionary. Populate it with 'add_data_table' and
    'add_data_table_entry', then serialise it to JSON to send the
    data table information back to the data manager.

    """
    return dict(data_tables=dict())
| 54 | |
| 55 | |
def add_data_table(d, table):
    """Register a data table in the data tables dictionary

    Stores a new empty list under the name 'table' inside
    d['data_tables']. If a table with that name is already
    present, it is replaced by the empty list.

    """
    tables = d['data_tables']
    tables[table] = list()
| 63 | |
| 64 | |
def add_data_table_entry(d, table, entry):
    """Append one entry to a named data table

    'entry' should be a dictionary whose keys are the names of
    the columns in the data table; it is added to the end of
    the list stored for 'table' in d['data_tables'].

    Raises an exception if the named data table doesn't exist
    (i.e. it was never added with 'add_data_table').

    """
    try:
        rows = d['data_tables'][table]
    except KeyError:
        raise Exception("add_data_table_entry: no table '%s'" % table)
    rows.append(entry)
| 80 | |
| 81 | |
def download_gtdb(data_tables, table_name, target_dp, test=False):
    """Download GTDB and register it in a data table

    Unless 'test' is set, runs
    'SemiBin download_GTDB --reference-db-data-dir <target_dp>'
    to fetch the reference data. In test mode nothing is
    downloaded: an empty file called 'empty' is created inside
    'target_dp' instead.

    In both cases a single entry is then appended to the data
    table 'table_name', with the columns 'dbkey', 'value' (today's
    date formatted as DDMMYYYY), 'name' and 'path' (the target
    directory as a string).

    The 'data_tables' dictionary should have been created using
    the 'create_data_tables_dict' and 'add_data_table' functions.

    Arguments:
      data_tables: a dictionary containing the data table info
      table_name: name of the table to add the entry to
      target_dp: directory (pathlib.Path) to put the data in
      test: if True, skip the download and create a placeholder

    Raises subprocess.CalledProcessError if the SemiBin command
    exits with a non-zero status, and an exception if the data
    table 'table_name' doesn't exist.

    """
    db_dp = target_dp
    if test:
        dbkey = 'test'
        name = "Test"
        empty_fp = db_dp / Path("empty")
        empty_fp.touch()
    else:
        # Pass the command as an argument list (no shell) so that a
        # target path containing spaces or shell metacharacters is
        # handed to SemiBin verbatim as a single argument.
        cmd = [
            "SemiBin",
            "download_GTDB",
            "--reference-db-data-dir",
            str(db_dp),
        ]
        subprocess.check_call(cmd)
        dbkey = 'gtdb'
        name = "GTDB reference genome generated by MMseqs2 used in SemiBin"
    add_data_table_entry(
        data_tables,
        table_name,
        dict(
            dbkey=dbkey,
            value=date.today().strftime("%d%m%Y"),
            name=name,
            path=str(db_dp)))
| 117 | |
| 118 | |
def main():
    """Run the data manager: download GTDB and report it to Galaxy

    Reads the JSON file given by '--json', creates the target
    directory named in it, downloads the reference data there (or
    only creates a placeholder when '--test' is given) and finally
    overwrites the same JSON file with the resulting data table
    entries.

    """
    print("Starting...")

    # Read command line
    parser = argparse.ArgumentParser(description='Download reference genomes (GTDB)')
    # '--json' is mandatory: without it there is neither an input to
    # read nor a place to write the result, so let argparse report
    # the problem instead of failing later on Path(None).
    parser.add_argument('--json', required=True, help="Path to JSON file")
    parser.add_argument('--test', action='store_true', help="Test")
    args = parser.parse_args()
    print("args : %s" % args)

    # Read the input JSON
    json_fp = Path(args.json)
    params, target_dp = read_input_json(json_fp)

    # Make the target directory
    print("Making %s" % target_dp)
    target_dp.mkdir(parents=True, exist_ok=True)

    # Set up data tables dictionary
    data_tables = create_data_tables_dict()
    add_data_table(data_tables, "gtdb")

    # Fetch data from specified data sources
    print("Download and build database")
    download_gtdb(
        data_tables,
        "gtdb",
        target_dp,
        args.test)

    # Write output JSON (back to the file the input was read from)
    print("Outputting JSON")
    with open(json_fp, 'w') as fh:
        json.dump(data_tables, fh, sort_keys=True)
    print("Done.")


if __name__ == "__main__":
    main()
