data_manager/interproscan.py @ 3:0df47f8552f6 (draft, default, tip)
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_interproscan commit 01e8e726d711c0685f46d3146e4fe0f2a1306036
author:   iuc
date:     Wed, 07 Dec 2022 13:41:38 +0000
parents:  0db4f153d86d
children: (none)
#!/usr/bin/env python
import argparse
import hashlib
import json
import operator
import os
import re
import shutil
import subprocess
import sys
import tarfile

import requests

GH_REPO_API = 'https://api.github.com/repos/ebi-pf-team/interproscan/'
MD5_URL = 'http://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/{version}/interproscan-{version}-64-bit.tar.gz.md5'
DATA_URL = 'http://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/{version}/interproscan-{version}-64-bit.tar.gz'
# For tests: download a smaller archive containing *some* data
PARTIAL_URL = 'https://github.com/ebi-pf-team/interproscan/archive/{version}.tar.gz'


def list_tags(url=None):
    if not url:
        url = GH_REPO_API + 'tags'

    resp = requests.get(url=url)
    data = resp.json()

    tags = []
    for tag in data:
        if re.match(r"^[0-9]\.[0-9]{2}-[0-9]{2}\.[0-9]$", tag['name']):
            tags.append(tag['name'])

    if 'next' in resp.links:
        tags += list_tags(resp.links['next']['url'])

    return sorted(tags)


def download_file(url, dest):
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(dest, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)


def main():
    parser = argparse.ArgumentParser(description='Download data for InterProScan')
    parser.add_argument('--partial', dest='partial', action='store_true', help='Only download a small subset of data (for testing)')
    parser.add_argument('-v', '--version', help='Specify an InterProScan version (default: latest)')
    parser.add_argument("datatable_name")
    parser.add_argument("galaxy_datamanager_filename")
    args = parser.parse_args()

    with open(args.galaxy_datamanager_filename) as fh:
        config = json.load(fh)

    output_directory = config.get("output_data", [{}])[0].get("extra_files_path", None)
    data_manager_dict = {}
    data_manager_dict["data_tables"] = config.get("data_tables", {})
    data_manager_dict["data_tables"][args.datatable_name] = data_manager_dict[
        "data_tables"
    ].get(args.datatable_name, [])

    os.mkdir(output_directory)

    all_tags = list_tags()
    if args.version:
        if args.version not in all_tags:
            raise RuntimeError("Version '%s' is not valid" % args.version)
        tag = args.version
    else:
        tag = all_tags[-1]

    setup_script = 'initial_setup.py'
    sub_version = re.match(r"^[0-9]\.([0-9]{2})-[0-9]{2}\.[0-9]$", tag)
    if sub_version and len(sub_version.groups()) == 1 and int(sub_version.group(1)) >= 58:
        # The setup script was renamed in 5.58
        setup_script = 'setup.py'
    else:
        raise RuntimeError("Sorry, this data manager can only download data for InterProScan >= 5.58-91.0. Use the 0.0.2 version for older versions of InterProScan.")

    print("Will download data for InterProScan version: %s" % tag)

    print("Getting MD5 checksum:")
    md5 = requests.get(url=MD5_URL.format(version=tag)).text
    if not re.match(r"^([a-fA-F\d]{32}) interproscan-[0-9]\.[0-9]{2}-[0-9]{2}\.[0-9]-64-bit.tar.gz$", md5):
        raise RuntimeError("Got invalid MD5 from the InterProScan FTP server: '%s'" % md5)
    print("%s" % md5)

    if args.partial:
        print("Downloading partial data tarball...")
        dest_tar = os.path.join(output_directory, PARTIAL_URL.format(version=tag).split('/')[-1])
        download_file(PARTIAL_URL.format(version=tag), dest_tar)
    else:
        print("Downloading data tarball...")
        dest_tar = os.path.join(output_directory, DATA_URL.format(version=tag).split('/')[-1])
        download_file(DATA_URL.format(version=tag), dest_tar)

    print("Finished, now checking md5...")
    m = hashlib.md5()
    blocksize = 2**20
    with open(dest_tar, 'rb') as tarball:
        while True:
            buf = tarball.read(blocksize)
            if not buf:
                break
            m.update(buf)
    md5_computed = m.hexdigest()

    if not md5.startswith(md5_computed):
        raise RuntimeError("MD5 check failed: computed '%s', expected '%s'" % (md5_computed, md5))

    print("Ok, now extracting data...")

    tar = tarfile.open(dest_tar, "r:gz")
    tar.extractall(output_directory)
    tar.close()

    if args.partial:
        print("Moving partial data files around...")
        shutil.move(os.path.join(output_directory, 'interproscan-%s' % tag, 'core/jms-implementation/support-mini-x86-32/data/'), os.path.join(output_directory, 'data'))
    else:
        print("Moving data files around...")
        shutil.move(os.path.join(output_directory, 'interproscan-%s' % tag, 'data'), os.path.join(output_directory, 'data'))

    print("Done, removing tarball and unneeded files...")
    os.remove(dest_tar)
    shutil.rmtree(os.path.join(output_directory, 'interproscan-%s' % tag))

    print("Running {} (index hmm models)...".format(setup_script))

    # Write a temp properties file in work dir
    prop_file_src = os.path.join(os.path.dirname(os.path.realpath(shutil.which("interproscan.sh"))), 'interproscan.properties')
    with open(prop_file_src, 'r') as prop:
        prop_content = prop.read()
    prop_content = re.sub(r'^data\.directory=.*$', 'data.directory=%s' % os.path.join(output_directory, 'data'), prop_content, flags=re.M)
    with open('interproscan.properties', 'w') as prop:
        prop.write(prop_content)

    # Run the index command
    cmd_args = [os.path.join(os.path.dirname(os.path.realpath(shutil.which("interproscan.sh"))), setup_script), 'interproscan.properties']
    proc = subprocess.Popen(args=cmd_args, shell=False)
    out, err = proc.communicate()
    print(out)
    print(err, file=sys.stderr)
    return_code = proc.wait()
    if return_code:
        print("Error running {}.".format(setup_script), file=sys.stderr)
        sys.exit(return_code)

    data_manager_dict["data_tables"][args.datatable_name].append(
        dict(
            value=tag,
            description="InterProScan %s" % tag,
            interproscan_version=tag,
            path=output_directory,
        )
    )

    print("Saving data table content...")
    data_manager_dict["data_tables"][args.datatable_name].sort(
        key=operator.itemgetter("value"), reverse=True
    )
    with open(args.galaxy_datamanager_filename, "w") as fh:
        json.dump(data_manager_dict, fh, indent=2, sort_keys=True)

    print("Finished.")


if __name__ == "__main__":
    main()
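
Below is a minimal sketch of how this script could be driven by hand outside Galaxy, assuming a hand-written params file. In real use Galaxy generates the galaxy_datamanager_filename JSON and the data table name comes from the data manager wrapper; the table name "interproscan_data", the output path and the version string below are illustrative assumptions only.

# Hypothetical stand-in for the params file Galaxy normally generates.
import json
import subprocess

params = {
    # extra_files_path must point to a directory that does not exist yet:
    # the script creates it itself with os.mkdir().
    "output_data": [{"extra_files_path": "/tmp/interproscan_data_test"}],
    "data_tables": {},
}
with open("galaxy.json", "w") as fh:
    json.dump(params, fh)

# Requires interproscan.sh on PATH (the script locates the setup script and
# interproscan.properties next to it). --partial fetches the small test
# archive instead of the full data tarball.
subprocess.run(
    [
        "python", "interproscan.py", "--partial", "-v", "5.59-91.0",
        "interproscan_data", "galaxy.json",
    ],
    check=True,
)

# On success, galaxy.json is rewritten with the new data table entry, roughly:
# {"data_tables": {"interproscan_data": [{"value": "5.59-91.0",
#   "description": "InterProScan 5.59-91.0", "interproscan_version": "5.59-91.0",
#   "path": "/tmp/interproscan_data_test"}]}}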