Mercurial > repos > iuc > data_manager_vep_cache_downloader
changeset 0:1439dface5bf draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_vep_cache_downloader commit 2db33cd5bcf5e2d7e3a43f11855c4cfc3b1b9f56
author | iuc |
---|---|
date | Wed, 11 May 2022 13:03:06 +0000 (2022-05-11) |
parents | |
children | |
files | data_manager/data_manager_vep_cache_download.py data_manager/data_manager_vep_cache_download.xml data_manager_conf.xml test-data/dbkeys.loc test-data/from_test-meta.data_manager.json test-data/vep_versioned_annotation_cache.loc tool-data/dbkeys.loc.sample tool-data/vep_versioned_annotation_cache.loc.sample tool_data_table_conf.xml.sample tool_data_table_conf.xml.test |
diffstat | 10 files changed, 169 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
"""Download a VEP cache archive and describe it as a Galaxy data table entry.

Source file: data_manager/data_manager_vep_cache_download.py

The script takes one command line argument: the path of a JSON file.  It
reads the job parameters from that file, downloads and unpacks the requested
cache archive into the output's ``extra_files_path`` and finally overwrites
the same JSON file with the metadata of the new
``vep_versioned_annotation_cache`` entry.
"""

import json
import os
import re
import sys
import tarfile
from urllib.request import urlretrieve

_ARCHIVE_SUFFIX = ".tar.gz"

# Matches "<species>_[merged|refseq]_vep_<version>_..." at the start of a
# cache archive name, e.g. "homo_sapiens_refseq_vep_106_GRCh38.tar.gz".
_CACHE_FILE_NAME_PATTERN = re.compile(r"(.*?)(merged|refseq)?_vep_(\d+?)_")


def _strip_archive_suffix(file_name):
    """Return *file_name* without a trailing ``.tar.gz``.

    ``str.strip(".tar.gz")`` is not suitable for this: it removes any of the
    characters ``.``, ``t``, ``a``, ``r``, ``g`` and ``z`` from *both* ends,
    which mangles names such as ``gallus_gallus_vep_106_GRCg6a.tar.gz``.
    """
    if file_name.endswith(_ARCHIVE_SUFFIX):
        return file_name[:-len(_ARCHIVE_SUFFIX)]
    return file_name


def _parse_cache_file_name(file_name):
    """Split a cache archive name into ``(species, cache_type, version)``.

    ``cache_type`` is ``"merged"``, ``"refseq"`` or, when neither marker is
    present, ``"default"``.  ``version`` is returned as a string of digits.

    Raises:
        ValueError: if *file_name* does not contain ``_vep_<digits>_``.
    """
    match = _CACHE_FILE_NAME_PATTERN.search(file_name)
    if match is None:
        raise ValueError(
            f"Cannot derive species and version from file name {file_name!r}; "
            "expected something like 'homo_sapiens_vep_106_GRCh38.tar.gz'"
        )
    species = match.group(1).rstrip("_")
    cache_type = match.group(2) or "default"
    version = match.group(3)
    return species, cache_type, version


def _checked_members(tar, destination):
    """Yield the members of *tar*, refusing any that would leave *destination*.

    Implemented as a generator so that the (potentially very large) archive
    is still read in a single pass by ``TarFile.extractall``.

    Raises:
        ValueError: if a member name resolves to a path outside *destination*.
    """
    root = os.path.realpath(destination)
    for member in tar:
        resolved = os.path.realpath(os.path.join(root, member.name))
        if os.path.commonpath([root, resolved]) != root:
            raise ValueError(
                f"Refusing to extract {member.name!r}: path is outside of {root!r}"
            )
        yield member


def main():
    """Download, unpack and register the cache named in the parameter file.

    ``sys.argv[1]`` must be the path of a JSON file providing
    ``output_data[0].extra_files_path`` and ``param_dict`` with the keys
    ``url``, ``file_name`` and ``dbkey``.  The file is overwritten with the
    resulting data table entry.

    Raises:
        ValueError: if the file name cannot be parsed or the archive contains
            a member that would be extracted outside the target directory.
    """
    # Read in given out_file and create target directory for file download
    with open(sys.argv[1]) as fh:
        params = json.load(fh)
    target_directory = params['output_data'][0]['extra_files_path']
    os.makedirs(target_directory, exist_ok=True)

    # Process parameters for metadata and file download
    param_dict = params['param_dict']
    file_name = param_dict['file_name']
    dbkey = param_dict['dbkey']
    url = param_dict['url'].rstrip("/") + "/" + file_name.lstrip("/")
    # Only the last path component names the archive; this also keeps the
    # local download path inside target_directory.
    archive_name = os.path.basename(file_name)
    species, cache_type, version = _parse_cache_file_name(archive_name)
    type_label = '' if cache_type == 'default' else ', ' + cache_type.capitalize()
    display_name = (
        f"{species.capitalize().replace('_', ' ')} {dbkey} (V{version}{type_label})"
    )

    # Download and extract given cache archive, remove archive afterwards
    final_file, _headers = urlretrieve(url, os.path.join(target_directory, archive_name))
    with tarfile.open(final_file, "r:gz") as tar:
        tar.extractall(target_directory, members=_checked_members(tar, target_directory))
    os.remove(final_file)

    # Construct metadata for the new data table entry
    entry_id = _strip_archive_suffix(archive_name)
    data_manager_dict = {
        'data_tables': {
            'vep_versioned_annotation_cache': [
                {
                    'value': entry_id,
                    'dbkey': dbkey,
                    'version': version,
                    'cachetype': cache_type,
                    'name': display_name,
                    'species': species,
                    'path': './%s' % entry_id,
                }
            ]
        }
    }

    # Save metadata to out_file
    with open(sys.argv[1], 'w') as fh:
        json.dump(data_manager_dict, fh, sort_keys=True)


if __name__ == "__main__":
    main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/data_manager_vep_cache_download.xml Wed May 11 13:03:06 2022 +0000 @@ -0,0 +1,45 @@ +<tool id="data_manager_vep_cache_download" name="Download and install VEP cache" version="0.1" tool_type="manage_data" profile="20.01"> + <description>versioned annotation files for VEP</description> + <macros> <!-- Release number substituted wherever the DB_VERSION token is used below (default URL, help texts, test) --> + <token name="@DB_VERSION@">106</token> + </macros> + <requirements> + <requirement type="package" version="3.7">python</requirement> + </requirements> <!-- The script below reads its parameters from the JSON file at $out_file and overwrites that file with the new data table entry --> + <command detect_errors="exit_code"><![CDATA[ + python '$__tool_directory__/data_manager_vep_cache_download.py' '$out_file' + ]]></command> + <inputs> <!-- The download address is built by the script as url + "/" + file_name --> + <param name="dbkey" type="genomebuild" + label="DBKEY of genome that the VEP cache data is for" + help="" /> + <param name="url" type="text" value="http://ftp.ensembl.org/pub/release-@DB_VERSION@/variation/indexed_vep_cache/" + label="FTP root url for VEP cache files" + help="The pre-filled link leads to the official cache files for VEP version @DB_VERSION@. If you want to download cache files for a different version of VEP, it is sufficient to change the release number in the link accordingly. It is strongly recommended to use a link to indexed cache files, just like the pre-filled one."/> + <param name="file_name" type="text" label="File name of cache file to be downloaded from root url" help="E.g. 
homo_sapiens_vep_@DB_VERSION@_GRCh38.tar.gz"/> + </inputs> + <outputs> + <data name="out_file" format="data_manager_json"/> + </outputs> + <tests> <!-- NOTE: this test downloads a real cache archive from the URL given below, so it needs network access --> + <test> + <param name="dbkey" value="ci3"/> + <param name="url" value="http://ftp.ensembl.org/pub/release-@DB_VERSION@/variation/indexed_vep_cache/"/> + <param name="file_name" value="ciona_intestinalis_refseq_vep_@DB_VERSION@_KH.tar.gz"/> + <output name="out_file" file="from_test-meta.data_manager.json"/> + </test> + </tests> + <help> +This tool downloads given versions of VEP cache annotation files and makes them available to Ensembl VEP in Galaxy via the +"vep_versioned_annotation_cache" data table. You should use the indexed version of the cache files and it is strongly +recommended to use the cache files which version number matches the VEP version number. Note that for most genomes there +are three versions of cache data available: default, refseq and merged (combining the former two). Choose the one suitable +for your usage. + +A general introduction to the VEP cache and download links can be found on the official website: +https://www.ensembl.org/info/docs/tools/vep/script/vep_cache.html + </help> + <citations> + <citation type="doi">10.1186/s13059-016-0974-4</citation> + </citations> +</tool> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_conf.xml Wed May 11 13:03:06 2022 +0000 @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<data_managers> + <data_manager tool_file="data_manager/data_manager_vep_cache_download.xml" id="data_manager_vep_cache_download" > + <data_table name="vep_versioned_annotation_cache"> <!-- The data table that receives one new row per run of the tool --> + <output> <!-- Maps the fields of the entry written by the tool script to the columns of the table --> + <column name="value" /> <!-- Entry ID, derived by the script from the archive file name --> + <column name="dbkey" /> <!-- Genome build chosen in the tool form --> + <column name="version" /> <!-- Cache version number parsed from the archive file name --> + <column name="cachetype" /> <!-- "refseq" or "merged" when the file name says so, otherwise "default" --> + <column name="name" /> <!-- Display label built from species, dbkey, version and cache type --> + <column name="species" /> <!-- Species part of the archive file name, e.g. homo_sapiens --> + <column name="path" output_ref="out_file" > <!-- Location of the unpacked cache; rewritten by the rules below --> + <move type="directory" relativize_symlinks="True"> <!-- Relocate the unpacked directory into the data manager data path (see Galaxy data manager docs; confirm exact source semantics there) --> + <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">vep/${version}/${dbkey}/${cachetype}</target> + </move> + <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/vep/${version}/${dbkey}/${cachetype}/</value_translation> <!-- Store the final location instead of the path reported by the script --> + <value_translation type="function">abspath</value_translation> <!-- Then turn it into an absolute path --> + </column> + </output> + </data_table> + </data_manager> +</data_managers> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/dbkeys.loc Wed May 11 13:03:06 2022 +0000 @@ -0,0 +1,3 @@ +#<dbkey> <display_name> <len_file_path> +hg38 Human hg38 a_path +ce11 C. elegans ce11 a_path \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/from_test-meta.data_manager.json Wed May 11 13:03:06 2022 +0000 @@ -0,0 +1,1 @@ +{"data_tables": {"vep_versioned_annotation_cache": [{"cachetype": "refseq", "dbkey": "ci3", "name": "Ciona intestinalis ci3 (V106, Refseq)", "path": "./ciona_intestinalis_refseq_vep_106_KH", "species": "ciona_intestinalis", "value": "ciona_intestinalis_refseq_vep_106_KH", "version": "106"}]}} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/vep_versioned_annotation_cache.loc Wed May 11 13:03:06 2022 +0000 @@ -0,0 +1,6 @@ +#<value> <dbkey> <version> <cachetype> <name> <species> <path> +# +ciona_intestinalis_refseq_vep_105_KH ci3 105 refseq Ciona intestinalis ci3 (V105, Refseq) ciona_intestinalis /home/sebastian/galaxy/tool-data/vep/105/ci3/refseq +caenorhabditis_elegans_vep_105_WBcel235 ce11 105 default Caenorhabditis elegans ce11 (V105) caenorhabditis_elegans /home/sebastian/galaxy/tool-data/vep/105/ce11/default +caenorhabditis_elegans_vep_104_WBcel235 ce11 104 default Caenorhabditis elegans ce11 (V104) caenorhabditis_elegans /home/sebastian/galaxy/tool-data/vep/104/ce11/default +drosophila_melanogaster_vep_105_BDGP6.32 dm6 105 default Drosophila melanogaster dm6 (V105) drosophila_melanogaster /home/sebastian/galaxy/tool-data/vep/105/dm6/default
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/dbkeys.loc.sample Wed May 11 13:03:06 2022 +0000 @@ -0,0 +1,1 @@ +#<dbkey> <display_name> <len_file_path> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/vep_versioned_annotation_cache.loc.sample Wed May 11 13:03:06 2022 +0000 @@ -0,0 +1,11 @@ +#This file describes vep cache data and its metadata available on the server. +#The data table has the format (white space characters are TAB characters): +# +#<value> <dbkey> <version> <cachetype> <name> <species> <path> +# +#So, vep_versioned_annotation_cache.loc tables could look like this: +# +#homo_sapiens_vep_105_GRCh38 hg38 105 default Homo sapiens hg38 (V105) homo_sapiens /path/to/vep_versioned_annotation_cache/105/hg38/default +#homo_sapiens_refseq_vep_105_GRCh38 hg38 105 refseq Homo sapiens hg38 (V105, Refseq) homo_sapiens /path/to/vep_versioned_annotation_cache/105/hg38/refseq +#homo_sapiens_merged_vep_105_GRCh38 hg38 105 merged Homo sapiens hg38 (V105, Merged) homo_sapiens /path/to/vep_versioned_annotation_cache/105/hg38/merged +# \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Wed May 11 13:03:06 2022 +0000 @@ -0,0 +1,12 @@ +<tables> + <!-- Installed VEP cache data: one row per cache, identified by genome build (dbkey), cache version and cache type --> + <table name="vep_versioned_annotation_cache" comment_char="#"> + <columns>value, dbkey, version, cachetype, name, species, path</columns> + <file path="tool-data/vep_versioned_annotation_cache.loc" /> + </table> + <!-- Known genome builds: dbkey, display name and path of the len file --> + <table name="__dbkeys__" comment_char="#"> + <columns>value, name, len_path</columns> + <file path="tool-data/dbkeys.loc" /> + </table> +</tables> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.test Wed May 11 13:03:06 2022 +0000 @@ -0,0 +1,12 @@ +<tables> + <!-- Installed VEP cache data, read from the fixture file under test-data --> + <table name="vep_versioned_annotation_cache" comment_char="#"> + <columns>value, dbkey, version, cachetype, name, species, path</columns> + <file path="${__HERE__}/test-data/vep_versioned_annotation_cache.loc" /> + </table> + <!-- Known genome builds (dbkey, display name, len file path), read from the fixture file under test-data --> + <table name="__dbkeys__" comment_char="#"> + <columns>value, name, len_path</columns> + <file path="${__HERE__}/test-data/dbkeys.loc" /> + </table> +</tables> \ No newline at end of file