Mercurial > repos > iuc > data_manager_vep_cache_downloader

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/data_manager_vep_cache_download.py	Wed May 11 13:03:06 2022 +0000
@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+
+import json
+import os
+import re
+import sys
+import tarfile
+from urllib.request import urlretrieve
+
+
def _strip_archive_suffix(file_name, suffix=".tar.gz"):
    """Return *file_name* without a trailing *suffix*.

    ``str.strip(".tar.gz")`` must not be used for this: it removes any of the
    characters ``. t a r g z`` from *both* ends of the string, which mangles
    names such as ``takifugu_rubripes_...`` or ``gallus_gallus_...``.
    ``str.removesuffix`` is avoided because the tool pins Python 3.7.
    """
    if suffix and file_name.endswith(suffix):
        return file_name[:-len(suffix)]
    return file_name


def _parse_cache_file_name(file_name):
    """Split a VEP cache archive name into ``(species, cache_type, version)``.

    ``cache_type`` is ``"merged"`` or ``"refseq"`` when the name carries that
    marker in front of ``_vep_``, otherwise ``"default"``.

    Raises:
        ValueError: if *file_name* does not contain ``_vep_<digits>_``.
    """
    m = re.search(r"(.*?)(merged|refseq)?_vep_(\d+?)_", file_name)
    if m is None:
        raise ValueError(
            f"Cannot derive species/version from file name {file_name!r}; "
            "expected something like 'homo_sapiens_vep_106_GRCh38.tar.gz'"
        )
    species = m.group(1).rstrip("_")
    cache_type = m.group(2) or "default"
    version = m.group(3)
    return species, cache_type, version


def _checked_members(tar, root):
    """Yield the members of *tar*, rejecting any that would land outside *root*.

    *root* must already be a real (symlink-resolved) absolute path. Members
    are checked lazily while the archive is streamed, so the archive is still
    read in a single pass.
    """
    for member in tar:
        destination = os.path.realpath(os.path.join(root, member.name))
        if os.path.commonpath([root, destination]) != root:
            raise ValueError(
                f"Refusing to extract {member.name!r}: "
                f"it would be written outside {root!r}"
            )
        yield member


def _extract_archive(archive_path, target_directory):
    """Extract the gzipped tar at *archive_path* into *target_directory*."""
    root = os.path.realpath(target_directory)
    # The context manager closes the archive even when extraction fails.
    with tarfile.open(archive_path, "r:gz") as tar:
        tar.extractall(target_directory, members=_checked_members(tar, root))


def main():
    """Download a VEP cache archive and write the matching data table entry.

    ``sys.argv[1]`` is the path of a JSON file that is first read for the job
    parameters (``output_data[0].extra_files_path`` and ``param_dict`` with
    ``url``, ``file_name`` and ``dbkey``) and finally overwritten with the
    metadata for the ``vep_versioned_annotation_cache`` data table.

    Raises:
        ValueError: if the file name cannot be parsed or the archive contains
            a member that would be extracted outside the target directory.
    """
    out_file = sys.argv[1]
    with open(out_file) as fh:
        params = json.load(fh)
    param_dict = params['param_dict']
    file_name = param_dict['file_name']
    dbkey = param_dict['dbkey']
    target_directory = params['output_data'][0]['extra_files_path']

    # Validate the file name first so a typo fails before anything is downloaded.
    species, cache_type, version = _parse_cache_file_name(file_name)
    type_suffix = '' if cache_type == 'default' else ', ' + cache_type.capitalize()
    display_name = f"{species.capitalize().replace('_', ' ')} {dbkey} (V{version}{type_suffix})"
    entry_id = _strip_archive_suffix(file_name)

    # Tolerate a pre-existing directory and create missing parents.
    os.makedirs(target_directory, exist_ok=True)

    # Download and extract the cache archive; always remove the archive
    # afterwards, even if extraction fails, so no partial download lingers.
    url = param_dict['url'].rstrip("/") + "/" + file_name.lstrip("/")
    archive_path, _headers = urlretrieve(url, os.path.join(target_directory, file_name))
    try:
        _extract_archive(archive_path, target_directory)
    finally:
        os.remove(archive_path)

    # Construct metadata for the new data table entry
    data_manager_dict = {
        'data_tables': {
            'vep_versioned_annotation_cache': [
                {
                    'value': entry_id,
                    'dbkey': dbkey,
                    'version': version,
                    'cachetype': cache_type,
                    'name': display_name,
                    'species': species,
                    'path': './%s' % entry_id,
                }
            ]
        }
    }

    # Save metadata to out_file
    with open(out_file, 'w') as fh:
        json.dump(data_manager_dict, fh, sort_keys=True)


if __name__ == "__main__":
    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/data_manager_vep_cache_download.xml	Wed May 11 13:03:06 2022 +0000
@@ -0,0 +1,45 @@
+<tool id="data_manager_vep_cache_download" name="Download and install VEP cache" version="0.1" tool_type="manage_data" profile="20.01">
+	<description>versioned annotation files for VEP</description>
+	<macros>
+		<token name="@DB_VERSION@">106</token>
+	</macros>
+	<requirements>
+		<requirement type="package" version="3.7">python</requirement>
+	</requirements>
+	<command detect_errors="exit_code"><![CDATA[
+		python '$__tool_directory__/data_manager_vep_cache_download.py' '$out_file'
+	]]></command>
+	<inputs>
+		<param name="dbkey" type="genomebuild"
+			label="DBKEY of genome that the VEP cache data is for"
+			help="" />
+		<param name="url" type="text" value="http://ftp.ensembl.org/pub/release-@DB_VERSION@/variation/indexed_vep_cache/"
+			label="FTP root url for VEP cache files"
+			help="The pre-filled link leads to the official cache files for VEP version @DB_VERSION@. If you want to download cache files for a different version of VEP, it is sufficient to change the release number in the link accordingly. It is strongly recommended to use a link to indexed cache files, just like the pre-filled one."/>
+		<param name="file_name" type="text" label="File name of cache file to be downloaded from root url" help="E.g. homo_sapiens_vep_@DB_VERSION@_GRCh38.tar.gz"/>
+	</inputs>
+	<outputs>
+		<data name="out_file" format="data_manager_json"/>
+	</outputs>
+	<tests>
+	<test>
+		<param name="dbkey" value="ci3"/>
+		<param name="url" value="http://ftp.ensembl.org/pub/release-@DB_VERSION@/variation/indexed_vep_cache/"/>
+		<param name="file_name" value="ciona_intestinalis_refseq_vep_@DB_VERSION@_KH.tar.gz"/>
+		<output name="out_file" file="from_test-meta.data_manager.json"/>
+		</test>
+	</tests>
+	<help>
+This tool downloads given versions of VEP cache annotation files and makes them available to Ensembl VEP in Galaxy via the
+"vep_versioned_annotation_cache" data table. You should use the indexed version of the cache files and it is strongly
+recommended to use the cache files whose version number matches the VEP version number. Note that for most genomes there
+are three versions of cache data available: default, refseq and merged (combining the former two). Choose the one suitable
+for your usage.
+
+A general introduction to the VEP cache and download links can be found on the official website:
+https://www.ensembl.org/info/docs/tools/vep/script/vep_cache.html
+	</help>
+	<citations>
+		<citation type="doi">10.1186/s13059-016-0974-4</citation>
+	</citations>
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_conf.xml	Wed May 11 13:03:06 2022 +0000
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<data_managers>
+    <data_manager tool_file="data_manager/data_manager_vep_cache_download.xml" id="data_manager_vep_cache_download" >
+        <data_table name="vep_versioned_annotation_cache">  <!-- Defines a Data Table to be modified. -->
+            <output> <!-- Handle the output of the Data Manager Tool -->
+                <column name="value" /> <!-- columns that are going to be specified by the Data Manager Tool -->
+                <column name="dbkey" /> <!-- columns that are going to be specified by the Data Manager Tool -->
+                <column name="version" /> <!-- columns that are going to be specified by the Data Manager Tool -->
+                <column name="cachetype" />  <!-- columns that are going to be specified by the Data Manager Tool -->
+                <column name="name" />  <!-- columns that are going to be specified by the Data Manager Tool -->
+				<column name="species" />  <!-- columns that are going to be specified by the Data Manager Tool -->
+                <column name="path" output_ref="out_file" >
+                    <move type="directory" relativize_symlinks="True">
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">vep/${version}/${dbkey}/${cachetype}</target>
+                    </move>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/vep/${version}/${dbkey}/${cachetype}/</value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+            </output>
+        </data_table>
+    </data_manager>
+</data_managers>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/dbkeys.loc	Wed May 11 13:03:06 2022 +0000
@@ -0,0 +1,3 @@
+#<dbkey>		<display_name>	<len_file_path>
+hg38			Human hg38		a_path
+ce11			C. elegans ce11	a_path
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/from_test-meta.data_manager.json	Wed May 11 13:03:06 2022 +0000
@@ -0,0 +1,1 @@
+{"data_tables": {"vep_versioned_annotation_cache": [{"cachetype": "refseq", "dbkey": "ci3", "name": "Ciona intestinalis ci3 (V106, Refseq)", "path": "./ciona_intestinalis_refseq_vep_106_KH", "species": "ciona_intestinalis", "value": "ciona_intestinalis_refseq_vep_106_KH", "version": "106"}]}}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/vep_versioned_annotation_cache.loc	Wed May 11 13:03:06 2022 +0000
@@ -0,0 +1,6 @@
+#<value>									<dbkey>			<version>		<cachetype>		<name>		<species>						<path>
+#
+ciona_intestinalis_refseq_vep_105_KH	ci3	105	refseq	Ciona intestinalis ci3 (V105, Refseq)	ciona_intestinalis	/home/sebastian/galaxy/tool-data/vep/105/ci3/refseq
+caenorhabditis_elegans_vep_105_WBcel235	ce11	105	default	Caenorhabditis elegans ce11 (V105)	caenorhabditis_elegans	/home/sebastian/galaxy/tool-data/vep/105/ce11/default
+caenorhabditis_elegans_vep_104_WBcel235	ce11	104	default	Caenorhabditis elegans ce11 (V104)	caenorhabditis_elegans	/home/sebastian/galaxy/tool-data/vep/104/ce11/default
+drosophila_melanogaster_vep_105_BDGP6.32	dm6	105	default	Drosophila melanogaster dm6 (V105)	drosophila_melanogaster	/home/sebastian/galaxy/tool-data/vep/105/dm6/default
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/dbkeys.loc.sample	Wed May 11 13:03:06 2022 +0000
@@ -0,0 +1,1 @@
+#<dbkey>		<display_name>	<len_file_path>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/vep_versioned_annotation_cache.loc.sample	Wed May 11 13:03:06 2022 +0000
@@ -0,0 +1,11 @@
+#This file describes vep cache data and its metadata available on the server.
+#The data table has the format (white space characters are TAB characters):
+#
+#<value>									<dbkey>			<version>		<cachetype>		<name>								<species>			<path>
+#
+#So, vep_versioned_annotation_cache.loc tables could look like this:
+#
+#homo_sapiens_vep_105_GRCh38				hg38			105				default			Homo sapiens hg38 (V105)			homo_sapiens		/path/to/vep_versioned_annotation_cache/105/hg38/default
+#homo_sapiens_refseq_vep_105_GRCh38			hg38			105				refseq			Homo sapiens hg38 (V105, Refseq)	homo_sapiens		/path/to/vep_versioned_annotation_cache/105/hg38/refseq
+#homo_sapiens_merged_vep_105_GRCh38			hg38			105				merged			Homo sapiens hg38 (V105, Merged)	homo_sapiens		/path/to/vep_versioned_annotation_cache/105/hg38/merged
+#
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Wed May 11 13:03:06 2022 +0000
@@ -0,0 +1,12 @@
+<tables>
+    <!-- Table of installed versioned vep cache data -->
+    <table name="vep_versioned_annotation_cache" comment_char="#">
+        <columns>value, dbkey, version, cachetype, name, species, path</columns>
+        <file path="tool-data/vep_versioned_annotation_cache.loc" />
+    </table>
+    <!-- Locations of dbkeys and len files under genome directory -->
+    <table name="__dbkeys__" comment_char="#">
+        <columns>value, name, len_path</columns>
+        <file path="tool-data/dbkeys.loc" />
+    </table>
+</tables>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.test	Wed May 11 13:03:06 2022 +0000
@@ -0,0 +1,12 @@
+<tables>
+    <!-- Table of installed versioned vep cache data -->
+    <table name="vep_versioned_annotation_cache" comment_char="#">
+        <columns>value, dbkey, version, cachetype, name, species, path</columns>
+        <file path="${__HERE__}/test-data/vep_versioned_annotation_cache.loc" />
+    </table>
+    <!-- Locations of dbkeys and len files under genome directory -->
+    <table name="__dbkeys__" comment_char="#">
+        <columns>value, name, len_path</columns>
+        <file path="${__HERE__}/test-data/dbkeys.loc" />
+    </table>
+</tables>
\ No newline at end of file