changeset 1:3645d1bcc7bb draft default tip

Uploaded
author saket-choudhary
date Sat, 18 Oct 2014 04:03:13 -0400
parents de145ceb3ac0
children
files tool_dependencies.xml vep_rest/test-data/vep_input.vcf vep_rest/test-data/vep_output.txt vep_rest/tool_dependencies.xml vep_rest/vep_rest.py vep_rest/vep_rest.xml
diffstat 6 files changed, 227 insertions(+), 31 deletions(-) [+]
line wrap: on
line diff
--- a/tool_dependencies.xml	Sat Oct 18 03:46:53 2014 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,31 +0,0 @@
-<?xml version='1.0' encoding='utf-8'?>
-<tool_dependency>
-    <package name="pyvcf" version="0.6.7">
-        <readme>
-            This Galaxy Tool shed package installs pyvcf(https://pypi.python.org/pypi/PyVCF)
-            package.
-
-            The corresponding PYTHONPATH is accessible via PYTHONPATH_PYVCF
-
-            Developmental version is hosted on Github:  https://github.com/saketkc/galaxy_tools/packages/package_pyvcf_0_6_7/
-        </readme>
-        <install version="1.0">
-            <actions>
-                <action type="download_by_url">https://pypi.python.org/packages/source/P/PyVCF/PyVCF-0.6.7.tar.gz</action>
-                <action type="make_directory">$INSTALL_DIR/lib/python</action>
-                <action type="shell_command">
-                    export PYTHONPATH=$PYTHONPATH:$INSTALL_DIR/lib/python &amp;&amp;
-                    python setup.py install --install-lib $INSTALL_DIR/lib/python
-                </action>
-                <action type="set_environment">
-                    <environment_variable action="append_to" name="PYTHONPATH">$INSTALL_DIR/lib/python</environment_variable>
-                    <environment_variable action="set_to" name="PYTHONPATH_PYVCF">$INSTALL_DIR/lib/python</environment_variable>
-                </action>
-            </actions>
-        </install>
-    </package>
-</tool_dependency>
-
-
-
-
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vep_rest/test-data/vep_input.vcf	Sat Oct 18 04:03:13 2014 -0400
@@ -0,0 +1,40 @@
+##fileformat=VCFv4.1
+##source=COSMICv70
+##reference=GRCh37
+##fileDate=20140805
+##comment="Missing nucleotide details indicate ambiguity during curation process"
+##comment="URL stub for COSM ID field (use numeric portion of ID)='http://cancer.sanger.ac.uk/cosmic/mutation/overview?id='"
+##comment="REF and ALT sequences are both forward strand
+##INFO=<ID=GENE,Number=1,Type=String,Description="Gene name">
+##INFO=<ID=STRAND,Number=1,Type=String,Description="Gene strand">
+##INFO=<ID=CDS,Number=1,Type=String,Description="CDS annotation">
+##INFO=<ID=AA,Number=1,Type=String,Description="Peptide annotation">
+##INFO=<ID=CNT,Number=1,Type=Integer,Description="How many samples have this mutation">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+1	69345	COSM911918	C	A	.	.	GENE=OR4F5;STRAND=+;CDS=c.255C>A;AA=p.I85I;CNT=1
+1	69523	COSM426644	G	T	.	.	GENE=OR4F5;STRAND=+;CDS=c.433G>T;AA=p.G145C;CNT=1
+1	69538	COSM75742	G	A	.	.	GENE=OR4F5;STRAND=+;CDS=c.448G>A;AA=p.V150M;CNT=1
+1	69539	COSM1343690	T	C	.	.	GENE=OR4F5;STRAND=+;CDS=c.449T>C;AA=p.V150A;CNT=1
+1	69540	COSM1560546	G	T	.	.	GENE=OR4F5;STRAND=+;CDS=c.450G>T;AA=p.V150V;CNT=1
+1	69569	COSM1599955	T	C	.	.	GENE=OR4F5;STRAND=+;CDS=c.479T>C;AA=p.L160P;CNT=2
+1	69591	COSM3419425	C	T	.	.	GENE=OR4F5;STRAND=+;CDS=c.501C>T;AA=p.V167V;CNT=1
+1	861390	COSM460103	G	C	.	.	GENE=SAMD11;STRAND=+;CDS=c.69G>C;AA=p.P23P;CNT=1
+1	865609	COSM336143	C	T	.	.	GENE=SAMD11;STRAND=+;CDS=c.147C>T;AA=p.P49P;CNT=1
+1	865617	COSM3790304	C	G	.	.	GENE=SAMD11;STRAND=+;CDS=c.155C>G;AA=p.S52C;CNT=1
+1	865624	COSM912740	C	T	.	.	GENE=SAMD11;STRAND=+;CDS=c.162C>T;AA=p.S54S;CNT=1
+1	865658	COSM364168	G	T	.	.	GENE=SAMD11;STRAND=+;CDS=c.196G>T;AA=p.G66W;CNT=1
+1	865691	COSM1686856	C	T	.	.	GENE=SAMD11;STRAND=+;CDS=c.229C>T;AA=p.P77S;CNT=1
+1	865716	COSM1735520	G	A	.	.	GENE=SAMD11;STRAND=+;CDS=c.254G>A;AA=p.R85K;CNT=1
+1	866438	COSM3386379	G	A	.	.	GENE=SAMD11;STRAND=+;CDS=c.274G>A;AA=p.V92M;CNT=1
+1	871165	COSM3711402	C	A	.	.	GENE=SAMD11;STRAND=+;CDS=c.319C>A;AA=p.L107I;CNT=1
+1	871217	COSM3667588	A	C	.	.	GENE=SAMD11;STRAND=+;CDS=c.371A>C;AA=p.E124A;CNT=1
+1	871255	COSM414754	G	A	.	.	GENE=SAMD11;STRAND=+;CDS=c.409G>A;AA=p.E137K;CNT=1
+1	874447	COSM178082	G	A	.	.	GENE=SAMD11;STRAND=+;CDS=c.458G>A;AA=p.R153H;CNT=1
+1	874456	COSM178083	G	C	.	.	GENE=SAMD11;STRAND=+;CDS=c.467G>C;AA=p.R156P;CNT=1
+1	874465	COSM112049	G	GC	.	.	GENE=SAMD11;STRAND=+;CDS=c.476_477insC;AA=p.D160fs*47;CNT=1
+1	874497	COSM912847	G	A	.	.	GENE=SAMD11;STRAND=+;CDS=c.508G>A;AA=p.E170K;CNT=1
+1	874501	COSM912848	C	T	.	.	GENE=SAMD11;STRAND=+;CDS=c.512C>T;AA=p.S171L;CNT=1
+1	874504	COSM1659453	C	G	.	.	GENE=SAMD11;STRAND=+;CDS=c.515C>G;AA=p.P172R;CNT=1
+1	874778	COSM1344642	GCCTCCCCAGCCACGGTGAGGACCCACCCTGGCATGATCCCCCTCATCA	G	.	.	GENE=SAMD11;STRAND=+;CDS=c.645_692del48;AA=p.G220_H235del16;CNT=2
+1	874781	COSM1344643	T	TC	.	.	GENE=SAMD11;STRAND=+;CDS=c.647_648insC;AA=p.S218fs*4;CNT=1
+1	874816	COSM1344644	C	CT	.	.	GENE=SAMD11;STRAND=+;CDS=c.682_683insT;AA=p.P228fs*227;CNT=3
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vep_rest/test-data/vep_output.txt	Sat Oct 18 04:03:13 2014 -0400
@@ -0,0 +1,5 @@
+ENSP00000393181   S52C,G66W
+ENSP00000471152   G45R,R42T,A40T
+ENSP00000411579   S52C,G66W
+ENSP00000342313   S52C,G66W
+ENSP00000334393   G145C,V150M,V150A,L160P
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vep_rest/tool_dependencies.xml	Sat Oct 18 04:03:13 2014 -0400
@@ -0,0 +1,13 @@
+<?xml version="1.0"?>
+<!--
+    Repository dependencies of the vep_rest tool. Each package is declared
+    in its own package element, with its own name and version.
+-->
+<tool_dependency>
+    <package name="requests" version="2.2.1">
+        <repository changeset_revision="04c9eef6c14b" name="package_requests_2_2_1" owner="saket-choudhary" toolshed="http://toolshed.g2.bx.psu.edu" />
+    </package>
+    <package name="pyvcf" version="0.6.7">
+        <repository changeset_revision="c05e29a21f10" name="package_pyvcf_0_6_7" owner="saket-choudhary" toolshed="http://toolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vep_rest/vep_rest.py	Sat Oct 18 04:03:13 2014 -0400
@@ -0,0 +1,120 @@
+#!/usr/bin/env python
+"""
+Script to interact with the Ensembl Variant Effect Predictor (VEP)
+web service
+
+
+The MIT License (MIT)
+
+Copyright (c) 2014  Saket Choudhary<saketkc@gmail.com, skchoudh@usc.edu>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+"""
+import argparse
+import requests
+import sys
+import time
+import vcf
+
+URL = 'http://grch37.rest.ensembl.org/vep/human/region/{}:{}-{}/{}?content-type=application/json&protein=1'
+
+class VEPRestClient:
+    """Query the Ensembl VEP REST service for every variant of a VCF file.
+
+    The amino-acid substitutions reported for each Ensembl protein (ENSP*)
+    are collected and written to ``output_file``, one protein per line.
+    """
+
+    def __init__(self, input_file, output_file):
+        """Read ``input_file`` (VCF) and queue one VEP request per ALT allele.
+
+        NOTE(review): regions are always POS-POS; confirm this suits indels.
+        """
+        self.pending_urls = []
+        self.output_file = output_file
+        with open(input_file, 'r') as vcf_handle:
+            for record in vcf.Reader(vcf_handle):
+                # One request per allele: the URL has a single allele slot,
+                # so concatenating multi-allelic ALTs would query a bogus one.
+                for allele in record.ALT:
+                    fields = (record.CHROM, record.POS, record.POS, str(allele))
+                    key = "{}:{}-{}-{}".format(*fields)
+                    self.pending_urls.append((key, URL.format(*fields)))
+
+    def submit(self):
+        """Run the queued requests and write the per-protein substitutions.
+
+        A response carrying a ``Retry-After`` header is retried once after
+        the requested delay. Responses that cannot be decoded are printed
+        and skipped. Substitutions containing "X" are ignored.
+        """
+        protein_variants = {}
+        for vcf_key, url in self.pending_urls:
+            request = requests.get(url)
+            retry_delay = request.headers.get('Retry-After')
+            if retry_delay:
+                # Header values are strings; time.sleep() needs a number.
+                time.sleep(float(retry_delay))
+                request = requests.get(url)
+            try:
+                response = request.json()[0]
+            except Exception as e:
+                #TODO Better error handling
+                print(e)
+                continue
+            if not response:
+                continue
+            # A result without 'transcript_consequences' contributes nothing.
+            for variant in response.get('transcript_consequences', []):
+                protein_id = variant.get('protein_id')
+                protein_start = variant.get('protein_start')
+                if not protein_id or not protein_id.startswith('ENSP'):
+                    continue
+                substitutions = protein_variants.setdefault(protein_id, [])
+                if not protein_start:
+                    continue
+                try:
+                    # Raises if 'amino_acids' is missing or is not "ref/alt".
+                    original, substituted = variant['amino_acids'].split("/")
+                except (AttributeError, KeyError, ValueError):
+                    continue
+                substitution = original + str(protein_start) + substituted
+                if "X" not in substitution:
+                    substitutions.append(substitution)
+
+        output = ""
+        for protein_id, substitutions in protein_variants.items():
+            if substitutions:
+                output += "{}   {}\n".format(protein_id, ",".join(substitutions))
+
+        with open(self.output_file, 'w') as f:
+            f.write(output)
+
+
+
+
+if __name__ == "__main__":  # command-line entry point
+    cli = argparse.ArgumentParser()
+    for flag, text in (("--input_file", "Input file location"),
+                       ("--output_file", "Output file location")):
+        cli.add_argument(flag, type=str, required=True, help=text)
+    options = cli.parse_args(sys.argv[1:])
+    VEPRestClient(options.input_file, options.output_file).submit()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vep_rest/vep_rest.xml	Sat Oct 18 04:03:13 2014 -0400
@@ -0,0 +1,55 @@
+<tool id="vep_rest" name="VEP Rest">
+    <description>VEP Web Service</description>
+    <!-- PyVCF is packaged as version 0.6.7 (repository package_pyvcf_0_6_7);
+         2.2.1 is the version of the requests package only. -->
+    <requirements>
+        <requirement type="package" version="2.2.1">requests</requirement>
+        <requirement type="python-module">requests</requirement>
+        <requirement type="package" version="0.6.7">pyvcf</requirement>
+        <requirement type="python-module">pyvcf</requirement>
+    </requirements>
+    <!-- The dataset paths are quoted so that each reaches the script as one argument. -->
+    <command interpreter="python">
+        vep_rest.py --input_file "$input" --output_file "$output"
+    </command>
+    <inputs>
+        <param name="input" format="vcf" type="data" label="Input variants" />
+    </inputs>
+    <outputs>
+        <data name="output" format="txt"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input" value="vep_input.vcf"/>
+            <output name="output" file="vep_output.txt"/>
+        </test>
+    </tests>
+    <help>
+
+    **What it does**
+
+        This tool calls the VEP REST web service for GRCh37 (http://grch37.rest.ensembl.org/) to fetch
+        the consequences of variations in proteins ONLY. Variations in transcripts are IGNORED.
+
+        Input is a VCF file (http://samtools.github.io/hts-specs/VCFv4.2.pdf).
+
+        Output is a text file; each line holds a protein identifier, three spaces and the
+        comma-separated substitutions found in that protein. Example::
+
+            ENSP00000393181   S52C,G66W,P77S,R85K,V92M,L107I
+            ENSP00000471152   G45R,R42T,A40T,G19E,L11F,T3M
+            ENSP00000411579   S52C,G66W,P77S,R85K,V92M,L107I,E124A,E137K,R153H,R156P,E170K,S171L,P172R
+            ENSP00000349216   R9K,V16M,L31I,E48A,E61K,R77H,R80P,E94K,S95L,P96R
+            ENSP00000342313   S52C,G66W,P77S,R85K,V92M,L107I,E124A,E137K,R153H,R156P,E170K,S171L,P172R
+
+    **Citations**
+
+        If you use this tool in Galaxy, please cite:
+            McLaren W, Pritchard B, Rios D, Chen Y, Flicek P, Cunningham F.
+            Deriving the consequences of genomic variants with the Ensembl API and SNP Effect Predictor.
+            Bioinformatics 26(16):2069-70(2010)
+            doi:10.1093/bioinformatics/btq330
+
+    </help>
+</tool>
+