# HG changeset patch
# User saket-choudhary
# Date 1413619393 14400
# Node ID 3645d1bcc7bbc7254e0fa8fa165fa038c75e07f4
# Parent de145ceb3ac09dda0c8fd6806a4d658fc82c3342
Uploaded
diff -r de145ceb3ac0 -r 3645d1bcc7bb tool_dependencies.xml
--- a/tool_dependencies.xml Sat Oct 18 03:46:53 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,31 +0,0 @@
-
-
-
-
- This Galaxy Tool shed package installs pyvcf(https://pypi.python.org/pypi/PyVCF)
- package.
-
- The corresponding PYTHONPATH is accessible via PYTHONPATH_PYVCF
-
- Developmental version is hosted on Github: https://github.com/saketkc/galaxy_tools/packages/package_pyvcf_0_6_7/
-
-
-
- https://pypi.python.org/packages/source/P/PyVCF/PyVCF-0.6.7.tar.gz
- $INSTALL_DIR/lib/python
-
- export PYTHONPATH=$PYTHONPATH:$INSTALL_DIR/lib/python &&
- python setup.py install --install-lib $INSTALL_DIR/lib/python
-
-
- $INSTALL_DIR/lib/python
- $INSTALL_DIR/lib/python
-
-
-
-
-
-
-
-
-
diff -r de145ceb3ac0 -r 3645d1bcc7bb vep_rest/test-data/vep_input.vcf
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/vep_rest/test-data/vep_input.vcf Sat Oct 18 04:03:13 2014 -0400
@@ -0,0 +1,40 @@
+##fileformat=VCFv4.1
+##source=COSMICv70
+##reference=GRCh37
+##fileDate=20140805
+##comment="Missing nucleotide details indicate ambiguity during curation process"
+##comment="URL stub for COSM ID field (use numeric portion of ID)='http://cancer.sanger.ac.uk/cosmic/mutation/overview?id='"
+##comment="REF and ALT sequences are both forward strand
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+#CHROM POS ID REF ALT QUAL FILTER INFO
+1 69345 COSM911918 C A . . GENE=OR4F5;STRAND=+;CDS=c.255C>A;AA=p.I85I;CNT=1
+1 69523 COSM426644 G T . . GENE=OR4F5;STRAND=+;CDS=c.433G>T;AA=p.G145C;CNT=1
+1 69538 COSM75742 G A . . GENE=OR4F5;STRAND=+;CDS=c.448G>A;AA=p.V150M;CNT=1
+1 69539 COSM1343690 T C . . GENE=OR4F5;STRAND=+;CDS=c.449T>C;AA=p.V150A;CNT=1
+1 69540 COSM1560546 G T . . GENE=OR4F5;STRAND=+;CDS=c.450G>T;AA=p.V150V;CNT=1
+1 69569 COSM1599955 T C . . GENE=OR4F5;STRAND=+;CDS=c.479T>C;AA=p.L160P;CNT=2
+1 69591 COSM3419425 C T . . GENE=OR4F5;STRAND=+;CDS=c.501C>T;AA=p.V167V;CNT=1
+1 861390 COSM460103 G C . . GENE=SAMD11;STRAND=+;CDS=c.69G>C;AA=p.P23P;CNT=1
+1 865609 COSM336143 C T . . GENE=SAMD11;STRAND=+;CDS=c.147C>T;AA=p.P49P;CNT=1
+1 865617 COSM3790304 C G . . GENE=SAMD11;STRAND=+;CDS=c.155C>G;AA=p.S52C;CNT=1
+1 865624 COSM912740 C T . . GENE=SAMD11;STRAND=+;CDS=c.162C>T;AA=p.S54S;CNT=1
+1 865658 COSM364168 G T . . GENE=SAMD11;STRAND=+;CDS=c.196G>T;AA=p.G66W;CNT=1
+1 865691 COSM1686856 C T . . GENE=SAMD11;STRAND=+;CDS=c.229C>T;AA=p.P77S;CNT=1
+1 865716 COSM1735520 G A . . GENE=SAMD11;STRAND=+;CDS=c.254G>A;AA=p.R85K;CNT=1
+1 866438 COSM3386379 G A . . GENE=SAMD11;STRAND=+;CDS=c.274G>A;AA=p.V92M;CNT=1
+1 871165 COSM3711402 C A . . GENE=SAMD11;STRAND=+;CDS=c.319C>A;AA=p.L107I;CNT=1
+1 871217 COSM3667588 A C . . GENE=SAMD11;STRAND=+;CDS=c.371A>C;AA=p.E124A;CNT=1
+1 871255 COSM414754 G A . . GENE=SAMD11;STRAND=+;CDS=c.409G>A;AA=p.E137K;CNT=1
+1 874447 COSM178082 G A . . GENE=SAMD11;STRAND=+;CDS=c.458G>A;AA=p.R153H;CNT=1
+1 874456 COSM178083 G C . . GENE=SAMD11;STRAND=+;CDS=c.467G>C;AA=p.R156P;CNT=1
+1 874465 COSM112049 G GC . . GENE=SAMD11;STRAND=+;CDS=c.476_477insC;AA=p.D160fs*47;CNT=1
+1 874497 COSM912847 G A . . GENE=SAMD11;STRAND=+;CDS=c.508G>A;AA=p.E170K;CNT=1
+1 874501 COSM912848 C T . . GENE=SAMD11;STRAND=+;CDS=c.512C>T;AA=p.S171L;CNT=1
+1 874504 COSM1659453 C G . . GENE=SAMD11;STRAND=+;CDS=c.515C>G;AA=p.P172R;CNT=1
+1 874778 COSM1344642 GCCTCCCCAGCCACGGTGAGGACCCACCCTGGCATGATCCCCCTCATCA G . . GENE=SAMD11;STRAND=+;CDS=c.645_692del48;AA=p.G220_H235del16;CNT=2
+1 874781 COSM1344643 T TC . . GENE=SAMD11;STRAND=+;CDS=c.647_648insC;AA=p.S218fs*4;CNT=1
+1 874816 COSM1344644 C CT . . GENE=SAMD11;STRAND=+;CDS=c.682_683insT;AA=p.P228fs*227;CNT=3
diff -r de145ceb3ac0 -r 3645d1bcc7bb vep_rest/test-data/vep_output.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/vep_rest/test-data/vep_output.txt Sat Oct 18 04:03:13 2014 -0400
@@ -0,0 +1,5 @@
+ENSP00000393181 S52C,G66W
+ENSP00000471152 G45R,R42T,A40T
+ENSP00000411579 S52C,G66W
+ENSP00000342313 S52C,G66W
+ENSP00000334393 G145C,V150M,V150A,L160P
diff -r de145ceb3ac0 -r 3645d1bcc7bb vep_rest/tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/vep_rest/tool_dependencies.xml Sat Oct 18 04:03:13 2014 -0400
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
diff -r de145ceb3ac0 -r 3645d1bcc7bb vep_rest/vep_rest.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/vep_rest/vep_rest.py Sat Oct 18 04:03:13 2014 -0400
@@ -0,0 +1,120 @@
+#!/usr/bin/env python
+"""
+Script to interact with the Ensembl Variant Effect Predictor (VEP)
+web service
+
+
+The MIT License (MIT)
+
+Copyright (c) 2014 Saket Choudhary
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+"""
+import argparse
+import requests
+import sys
+import time
+import vcf
+
+URL = 'http://grch37.rest.ensembl.org/vep/human/region/{}:{}-{}/{}?content-type=application/json&protein=1'
+
+class VEPRestClient:
+
+ def __init__(self, input_file, output_file):
+ self.pending_urls = []
+ vcf_reader = vcf.Reader(open(input_file, 'r'))
+ self.output_file = output_file
+ for record in vcf_reader:
+ url = URL.format(record.CHROM, record.POS, record.POS, ("").join([str(x) for x in record.ALT]))
+ key = "{}:{}-{}-{}".format(record.CHROM, record.POS, record.POS, ("").join([str(x) for x in record.ALT]))
+ self.pending_urls.append((key, url))
+
+ def submit(self):
+ protein_variants = {}
+ for record in self.pending_urls:
+ vcf_key = record[0]
+ url = record[1]
+ request = requests.get(url)
+ time_delay = None
+ try:
+ retry_delay = request.headers['Retry-After']
+ time_delay = retry_delay
+ except KeyError:
+ pass
+ response = None
+ if time_delay:
+ time.sleep(time_delay)
+ request = requests.get(url)
+ try:
+ response = request.json()[0]
+ except Exception as e:
+ #TODO Better error handling
+ print e
+ if not response:
+ continue
+ variants = response['transcript_consequences']
+ consequence = ""
+ for variant in variants:
+ consequence = ""
+ protein_id = None
+ protein_start = None
+ try:
+ protein_id = variant['protein_id']
+ except KeyError:
+ pass
+ try:
+ protein_start = variant['protein_start']
+ except KeyError:
+ pass
+ if protein_id:
+ if protein_id.startswith('ENSP'):
+ if variant['protein_id'] not in protein_variants.keys():
+ protein_variants[protein_id] = []
+ consequence += protein_id
+ if protein_start:
+ try:
+ #TODO Better error handling
+ amino_acid_original, amino_acid_substituted = variant['amino_acids'].split("/")
+ substitution = amino_acid_original + str(protein_start) + amino_acid_substituted
+ if "X" not in substitution:
+ protein_variants[variant['protein_id']].append(substitution)
+ consequence += " ," + substitution
+ except:
+ pass
+
+ output = ""
+ for key, value in protein_variants.iteritems():
+ if len(value)>0:
+ output += "{} {}\n".format(key, (",").join(value))
+
+ with open(self.output_file, 'wb') as f:
+ f.write(output)
+
+
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--input_file", type=str, required=True, help="Input file location")
+ parser.add_argument("--output_file", type=str, required=True, help="Output file location")
+ args = parser.parse_args(sys.argv[1:])
+ vep = VEPRestClient(args.input_file, args.output_file)
+ vep.submit()
+
diff -r de145ceb3ac0 -r 3645d1bcc7bb vep_rest/vep_rest.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/vep_rest/vep_rest.xml Sat Oct 18 04:03:13 2014 -0400
@@ -0,0 +1,55 @@
+
+ VEP Web Service
+
+ requests
+ requests
+ pyvcf
+ pyvcf
+
+
+ vep_rest.py --input_file $input --output_file $output
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ **What it does**
+
+ This script calls the VEP REST web service for GRCh37 (http://grch37.rest.ensembl.org/) to fetch
+ the consequences of variations in proteins ONLY. Variations in transcripts are IGNORED.
+
+ Input is a VCF file [http://samtools.github.io/hts-specs/VCFv4.2.pdf].
+
+ Output is a text file in which each line begins with a protein identifier followed by its comma-separated substitutions.
+ Example:
+
+ ENSP00000393181 S52C,G66W,P77S,R85K,V92M,L107I
+ ENSP00000471152 G45R,R42T,A40T,G19E,L11F,T3M
+ ENSP00000411579 S52C,G66W,P77S,R85K,V92M,L107I,E124A,E137K,R153H,R156P,E170K,S171L,P172R
+ ENSP00000349216 R9K,V16M,L31I,E48A,E61K,R77H,R80P,E94K,S95L,P96R
+ ENSP00000342313 S52C,G66W,P77S,R85K,V92M,L107I,E124A,E137K,R153H,R156P,E170K,S171L,P172R
+
+
+ **Citations**
+
+ If you use this tool in Galaxy, please cite:
+ McLaren W, Pritchard B, Rios D, Chen Y, Flicek P, Cunningham F.
+ Deriving the consequences of genomic variants with the Ensembl API and SNP Effect Predictor.
+ Bioinformatics 26(16):2069-70(2010)
+ doi:10.1093/bioinformatics/btq330
+
+
+
+
+