Mercurial > repos > iuc > data_manager_gemini_database_downloader
view data_manager/data_manager_gemini_download.py @ 9:27a6a256cd23 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_gemini_database_downloader commit 275b7863ff4f8b0dff9cd7ea6c4b635694f0168d
author | iuc |
---|---|
date | Sat, 03 Dec 2022 10:37:24 +0000 |
parents | 52b6a4d98009 |
children |
line wrap: on
line source
#!/usr/bin/env python2

# IMPORTANT: This will run using Python 2 still!

import datetime
import json
import os
import subprocess
import sys

import yaml


def write_gemini_config(config, config_file):
    """Serialize *config* as block-style YAML into *config_file*.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(config_file, 'w') as fo:
        yaml.dump(config, fo, allow_unicode=False, default_flow_style=False)


def load_gemini_config(config_file):
    """Parse the YAML file *config_file* and return the resulting object.

    A Loader is passed explicitly because calling ``yaml.load()`` without
    one is deprecated since PyYAML 5.1 and raises a TypeError from
    PyYAML 6 on.
    """
    with open(config_file) as fi:
        # NOTE: yaml.Loader can construct arbitrary Python objects, so it
        # must only be used on trusted files. The file read here is the one
        # written by write_gemini_config() (and later rewritten by gemini
        # itself). The full loader is kept instead of safe_load() so that
        # Python-specific tags which plain yaml.dump() may emit under
        # Python 2 still load.
        return yaml.load(fi, Loader=yaml.Loader)


def _describe_extras(param_dict):
    """Return the name suffix listing the optional annotations requested.

    The result is '' when neither GERP-bp nor CADD data was selected,
    otherwise something like ' w/ GERP & CADD'.
    """
    extras = []
    if param_dict['gerp_bp']:
        extras.append('GERP')
    if param_dict['cadd']:
        extras.append('CADD')
    if not extras:
        return ''
    return ' w/ ' + ' & '.join(extras)


def _build_update_command(param_dict):
    """Return the ``gemini update`` argument list for the requested data."""
    cmd = ['gemini', 'update', '--dataonly']
    if param_dict['gerp_bp']:
        cmd += ['--extra', 'gerp_bp']
    if param_dict['cadd']:
        cmd += ['--extra', 'cadd_score']
    return cmd


def main():
    """Download GEMINI annotation data and register it in a data table.

    Reads the JSON parameter file named by ``sys.argv[1]``, overwrites that
    same file with the new ``gemini_versioned_databases`` data table entry,
    and (unless the ``test_data_manager`` parameter is set) runs
    ``gemini update --dataonly`` to fetch the annotation files into the
    output dataset's extra files directory.

    Raises:
        OSError: if the target directory cannot be created (``os.mkdir``
            also fails when the directory already exists).
        subprocess.CalledProcessError: if ``gemini update`` exits non-zero.
    """
    today = datetime.date.today()
    snapshot = today.isoformat()

    with open(sys.argv[1]) as fh:
        params = json.load(fh)
    param_dict = params['param_dict']
    target_directory = params['output_data'][0]['extra_files_path']
    os.mkdir(target_directory)

    # Prepare the metadata for the new data table record.
    # The name of the database should reflect whether it was built with or
    # without the optional GERP-bp data, the CADD scores, or both.
    # This builds up the corresponding part of the name:
    anno_desc = _describe_extras(param_dict)

    data_manager_dict = {
        'data_tables': {
            'gemini_versioned_databases': [
                {
                    'value': snapshot,
                    'dbkey': 'hg19',
                    'version': param_dict['gemini_db_version'],
                    'name': 'GEMINI annotations%s (%s snapshot)' % (
                        anno_desc, snapshot
                    ),
                    'path': './%s' % snapshot
                }
            ]
        }
    }

    # Save the data table metadata to the json results file
    # (the same file the parameters were read from).
    with open(sys.argv[1], 'w') as fh:
        json.dump(data_manager_dict, fh, sort_keys=True)

    # Generate a minimal configuration file for GEMINI update
    # to instruct the tool to download the annotation data into a
    # subfolder of the target directory.
    config_file = os.path.join(target_directory, 'gemini-config.yaml')
    anno_dir = os.path.join(target_directory, 'gemini/data')
    gemini_bootstrap_config = {'annotation_dir': anno_dir}
    write_gemini_config(gemini_bootstrap_config, config_file)

    # Verify that we can read the config_file just created as we need to do
    # so after the data download has finished and it is very annoying to
    # have this fail after dozens of GBs of data have been downloaded.
    # Only the side effect (raising on failure) matters here.
    load_gemini_config(config_file)

    # Now gemini update can be called to download the data.
    # The GEMINI_CONFIG environment variable lets the tool discover
    # the configuration file we prepared for it.
    # Note that the tool will rewrite the file turning it into a
    # complete gemini configuration file.
    gemini_env = os.environ.copy()
    gemini_env['GEMINI_CONFIG'] = target_directory
    cmd = _build_update_command(param_dict)

    if not param_dict['test_data_manager']:
        # This is not a test => Going to embark on a massive download now
        subprocess.check_call(cmd, env=gemini_env)

    # GEMINI tool wrappers that need access to the annotation files
    # are supposed to symlink them into a gemini/data subfolder of
    # the job working directory. To have GEMINI discover them there,
    # we need to set this location as the 'annotation_dir' in the
    # configuration file.
    config = load_gemini_config(config_file)
    config['annotation_dir'] = 'gemini/data'
    write_gemini_config(config, config_file)


if __name__ == "__main__":
    main()