annotate cravatp_submit.py @ 1:2c7bcc1219fc draft

Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
author galaxyp
date Thu, 16 Aug 2018 12:27:35 -0400
parents
children a018c44dc18b
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
1 # -*- coding: utf-8 -*-
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
2 #
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
3 # Author: Ray W. Sajulga
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
4 #
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
5 #
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
6
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
7 import requests # pipenv requests
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
8 import json
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
9 import time
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
10 import urllib
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
11 import sys
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
12 import csv
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
13 import re
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
14 import math
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
15 import argparse
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
16 from xml.etree import ElementTree as ET
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
17 from zipfile import ZipFile
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
18 try: #Python 3
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
19 from urllib.request import urlopen
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
20 except ImportError: #Python 2
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
21 from urllib2 import urlopen
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
22 from io import BytesIO
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
23
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
# Command-line interface of the CRAVAT-P submission script.
#
# Testing command:
#   python cravatp_submit.py test-data/Freebayes_two-variants.vcf GRCh38
#       test-data/variant.tsv test-data/gene.tsv test-data/noncoding.tsv
#       test-data/error.tsv CHASM --classifier Breast --proBED
#       test-data/MCF7_proBed.bed
parser = argparse.ArgumentParser()

# Positional arguments first (their order defines the command line), then
# the optional flags; each entry is (argument name, help text).
for _arg_name, _arg_help in (
        ('cravatInput',
         'The filename of the input CRAVAT-formatted tabular file '
         '(e.g., VCF)'),
        ('GRCh',
         'The name of the human reference genome used for annotation: '
         'GRCh38/hg38 or GRCh37/hg19'),
        ('variant',
         'The filename of the output variant file'),
        ('gene',
         'The filename of the output gene variant report'),
        ('noncoding',
         'The filename of the output non-coding variant report'),
        ('error',
         'The filename of the output error file'),
        ('analysis',
         'The machine-learning algorithm used for CRAVAT annotation '
         '(VEST and/or CHASM)'),
        ('--classifier',
         'The cancer classifier for the CHASM algorithm'),
        ('--proBED',
         'The filename of the proBED file containing peptides with '
         'genomic coordinates'),
        ('--intersectOnly',
         'Specifies whether to analyze only variants intersected between '
         'the CRAVAT input and proBED file'),
        ('--vcfOutput',
         'The output filename of the intersected VCF file'),
):
    parser.add_argument(_arg_name, help=_arg_help)

# Unpack the parsed arguments into the module-level names used by the rest
# of the script.
args = parser.parse_args()
input_filename = args.cravatInput
GRCh_build = args.GRCh
output_filename = args.variant
file_3 = args.gene
file_4 = args.noncoding
file_5 = args.error

# The literal string 'None' leaves the analysis type unset.
analysis_type = args.analysis if args.analysis != 'None' else None
# Any analysis value containing '+' is replaced by 'CHASM;VEST'.
if analysis_type and '+' in analysis_type:
    analysis_type = 'CHASM;VEST'

# Optional flags fall back to their blank defaults when absent or empty.
chasm_classifier = args.classifier or ''
probed_filename = args.proBED or None
intersected_only = args.intersectOnly or False
vcf_output = args.vcfOutput or None
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
88
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
89 # obtains the transcript's protein sequence using Ensembl API
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
def getSequence(transcript_id, timeout=30):
    """Fetch a protein sequence for an identifier from the Ensembl REST API.

    Requests ``/sequence/id/<transcript_id>`` as SeqXML with
    ``type=protein`` and returns the text of the first ``AAseq`` element
    found in the response.

    Args:
        transcript_id: Identifier appended to the ``/sequence/id/`` path.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the script indefinitely.

    Returns:
        The sequence text, or None when the request fails, the server
        answers with a non-OK status, the body is not well-formed XML, or
        the response contains no ``AAseq`` element.
    """
    server = 'http://rest.ensembl.org'
    ext = ('/sequence/id/' + transcript_id
           + '?content-type=text/x-seqxml%2Bxml;'
           'multiple_sequences=1;type=protein')
    try:
        req = requests.get(server + ext,
                           headers={"Content-Type": "text/plain"},
                           timeout=timeout)
    except requests.RequestException:
        # Network failure or timeout: report "no sequence" the same way a
        # non-OK response is reported.
        return None

    if not req.ok:
        return None

    try:
        root = ET.fromstring(req.content)
    except ET.ParseError:
        return None

    # With multiple_sequences=1 several entries may come back; only the
    # first AAseq element is used.
    first_seq = next(root.iter('AAseq'), None)
    if first_seq is None:
        return None
    return first_seq.text
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
104
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
105 # parses the proBED file as a list.
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
def loadProBED(filename=None):
    """Read a tab-separated proBED file into a list of rows.

    Args:
        filename: Path of the file to read. When omitted, the module-level
            ``probed_filename`` is used, as before.

    Returns:
        A list with one entry per non-blank line; each entry is the list
        of tab-separated string fields of that line. Blank lines are
        skipped so that callers indexing into the rows do not hit empty
        lists.
    """
    if filename is None:
        filename = probed_filename
    with open(filename) as tsvin:
        return [row for row in csv.reader(tsvin, delimiter='\t') if row]
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
113
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
# NOTE(review): not read anywhere in this section; presumably consumed by
# the output-writing code further down the script.
write_header = True


# Creates a VCF file that only contains variants that overlap with the
# proteogenomic input (proBED) file if the user specifies that they want
# to only include intersected variants or if they want to receive the
# intersected VCF as well.
if probed_filename and (vcf_output or intersected_only == 'true'):
    proBED = loadProBED()
    if not vcf_output:
        vcf_output = 'intersected_input.vcf'

    # Parse the peptide coordinates once instead of once per variant.
    # Rows with fewer than three fields carry no interval and are ignored.
    peptide_spans = [(peptide[0], int(peptide[1]), int(peptide[2]))
                     for peptide in proBED if len(peptide) >= 3]

    # The output is opened in text mode: csv.writer emits str, which a
    # binary-mode handle rejects on Python 3.
    with open(input_filename) as tsvin, open(vcf_output, 'w') as vcf_handle:
        tsvreader = csv.reader(tsvin, delimiter='\t')
        # NOTE(review): with QUOTE_NONE the writer prefixes special
        # characters with the escapechar (a space here) -- confirm the
        # resulting file is still what CRAVAT expects.
        tsvout = csv.writer(vcf_handle, delimiter='\t', escapechar=' ',
                            quoting=csv.QUOTE_NONE)

        for row in tsvreader:
            # Blank lines and '#' header/comment lines are copied through.
            if not row or row[0].startswith('#'):
                tsvout.writerow(row)
                continue

            genchrom = row[0]
            genpos = int(row[1])

            # Keep the variant if it lies inside at least one peptide
            # interval on the same chromosome.
            # NOTE(review): both interval ends are treated as inclusive;
            # BED-style starts are usually 0-based while VCF positions are
            # 1-based -- confirm this comparison is intended.
            if any(pepchrom == genchrom and pepposA <= genpos <= pepposB
                   for pepchrom, pepposA, pepposB in peptide_spans):
                tsvout.writerow(row)

    # When only intersected variants are wanted, the filtered VCF replaces
    # the original input for the CRAVAT submission.
    if intersected_only == 'true':
        input_filename = vcf_output
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
148
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
# sets up the parameters for submission to the CRAVAT API
# NOTE(review): the contact e-mail address is hard-coded; consider making
# it configurable.
parameters = {
    'email': 'rsajulga@umn.edu',
    # The help text advertises both spellings of the older build
    # ("GRCh37/hg19"), so either one selects hg19.
    'hg19': 'on' if GRCh_build in ('GRCh37', 'hg19') else 'off',
    'functionalannotation': 'on',
    'tsvreport': 'on',
    'mupitinput': 'on',
}
if analysis_type:
    parameters['analyses'] = analysis_type
if chasm_classifier:
    parameters['chasmclassifier'] = chasm_classifier

# plugs in params to given URL; the input file is uploaded in binary mode
# and closed as soon as the request has been sent
with open(input_filename, 'rb') as inputfile:
    submit = requests.post('http://www.cravat.us/CRAVAT/rest/service/submit',
                           files={'inputfile': inputfile},
                           data=parameters)
# Fail with a clear HTTP error instead of a JSON decoding error.
submit.raise_for_status()

# makes the data a json dictionary; takes out only the job ID
jobid = json.loads(submit.text)['jobid']
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
167
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
# Polls the CRAVAT status endpoint every two seconds until the job reports
# 'Success'; `resultfileurl` then holds the value from the last response.
while True:
    check = requests.get(
        'http://www.cravat.us/CRAVAT/rest/service/status',
        params={'jobid': jobid})
    check.raise_for_status()
    job_state = json.loads(check.text)  # parse the response only once
    status = job_state['status']
    resultfileurl = job_state['resultfileurl']
    if status == 'Success':
        break

    # Stop instead of polling forever when the job has evidently failed.
    # NOTE(review): the exact failure status strings are not visible here;
    # confirm them against the CRAVAT REST API documentation.
    status_text = str(status).lower()
    if 'error' in status_text or 'fail' in status_text:
        sys.exit('CRAVAT job %s ended with status: %s' % (jobid, status))

    time.sleep(2)
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
181
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
182 # obtains the zipfile created by CRAVAT and loads the variants and VAD
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
183 # file for processing
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
184 r = requests.get(resultfileurl, stream=True)
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
185 url = urlopen(resultfileurl)
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
186 zipfile = ZipFile(BytesIO(r.content))
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
187 variants = zipfile.open(jobid + '/Variant.Result.tsv').readlines()
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
188 vad = zipfile.open(jobid + '/Variant_Additional_Details.Result.tsv').readlines()
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
189
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
190 # reads and writes the gene, noncoding, and error files
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
191 open(file_3, 'wb').write(zipfile.read(jobid + '/Gene_Level_Analysis.Result.tsv'))
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
192 open(file_4, 'wb').write(zipfile.read(jobid + '/Variant_Non-coding.Result.tsv'))
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
193 open(file_5, 'wb').write(zipfile.read(jobid + '/Input_Errors.Result.tsv'))
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
194
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
195
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
196
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
if probed_filename and not vcf_output:
    proBED = loadProBED()

# Screens the Variant Additional Details (VAD) rows against the proBED
# peptides and builds two lookup tables used when the merged report is
# written:
#   pep_muts: peptide sequence -> {protein sequence change -> [offsets]}
#   pep_map:  CRAVAT 'Input line' value -> [peptide sequences it falls in]
#
# NOTE: an output writer used to be opened here and never written to (the
# report is opened again where it is actually written), together with an
# unused ``rows`` list; both have been removed.
if probed_filename:
    n = 11  # Index for proteogenomic column start
    # Raw strings: '\d' and '\.' are invalid escapes in a normal literal.
    reg_seq_change = re.compile(r'([A-Z]+)(\d+)([A-Z]+)')
    SOtranscripts = re.compile(r'([A-Z]+[\d\.]+):([A-Z]+\d+[A-Z]+)')
    pep_muts = {}
    pep_map = {}
    for row in vad:
        # Strip the line ending (including a possible '\r') before splitting
        # so the last column is clean.
        row = row.decode().rstrip('\r\n').split('\t')
        if not row[0] or row[0].startswith('#'):
            continue

        # checks if the row begins with input line
        if row[0].startswith('Input line'):
            vad_headers = row
            continue

        # Initially screens through the output Variant
        # Additional Details to catch mutations on
        # same peptide region
        genchrom = row[vad_headers.index('Chromosome')]
        genpos = int(row[vad_headers.index('Position')])
        aa_change = row[vad_headers.index('Protein sequence change')]
        input_line = row[vad_headers.index('Input line')]

        for peptide in proBED:
            pepseq = peptide[3]
            pepchrom = peptide[0]
            pepposA = int(peptide[1])
            pepposB = int(peptide[2])
            if genchrom != pepchrom or not (pepposA <= genpos <= pepposB):
                continue

            strand = row[vad_headers.index('Strand')]
            transcript_strand = row[vad_headers.index('S.O. transcript strand')]

            # Calculates the position of the variant
            # amino acid(s) on peptide.
            # The offset is counted from the peptide start unless either
            # strand is '-' or the forward offset falls past the peptide
            # end; then it is counted back from the peptide end.
            # Previously, when the two strand values differed and neither
            # was '-', ``aa_peppos`` kept the value from an earlier
            # peptide (or was undefined); the forward offset is now used.
            minus_strand = strand == '-' or transcript_strand == '-'
            if not minus_strand:
                aa_peppos = int(math.ceil((genpos - pepposA) / 3.0) - 1)
            if minus_strand or aa_peppos >= len(pepseq):
                aa_peppos = int(math.floor((pepposB - genpos) / 3.0))

            # Records each distinct offset once per (peptide, change).
            offsets = pep_muts.setdefault(pepseq, {}).setdefault(aa_change, [])
            if aa_peppos not in offsets:
                offsets.append(aa_peppos)

            # Stores the intersect information by mapping
            # Input Line (CRAVAT output) to peptide sequence.
            mapped_peptides = pep_map.setdefault(input_line, [])
            if pepseq not in mapped_peptides:
                mapped_peptides.append(pepseq)
            # TODO: Need to obtain strand information as
            # well i.e., positive (+) or negative (-)
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
262
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
263
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
# Writes the merged report: every Variant Result (VR) row followed by the
# Variant Additional Details (VAD) columns that are not already present and,
# when a proBED file was supplied, a 'Reference peptide' / 'Variant peptide'
# column pair inserted at index ``n``.
with open(output_filename, 'w') as out_handle:
    # escapechar=None instead of '': an empty escapechar is rejected with a
    # TypeError by csv.writer on Python 3.11+, and None is what '' used to
    # mean on older versions.
    tsvout = csv.writer(out_handle, delimiter='\t', escapechar=None,
                        quoting=csv.QUOTE_NONE)
    headers = []
    duplicate_indices = []

    # loops through each row in the Variant Result (VR) file
    for x, row in enumerate(variants):
        row = row.decode().rstrip('\r\n').split('\t')
        # sets row_2 equal to the same row in the VAD file
        # NOTE(review): assumes both files have the same line layout, so
        # that line x of one corresponds to line x of the other - confirm.
        row_2 = vad[x].decode().rstrip('\r\n').split('\t')

        # comment lines are copied through, blank lines are dropped
        if not row[0] or row[0].startswith('#'):
            if row[0]:
                tsvout.writerow(row)
            continue

        if row[0].startswith('Input line'):
            # starts from the VR headers and appends every VAD header
            # that is not already present; the positions of repeated VAD
            # columns are remembered so their values can be skipped too
            headers = row
            for i, value in enumerate(row_2):
                if value in headers:
                    duplicate_indices.append(i)
                else:
                    headers.append(value)
            # adds appropriate headers when proteomic input is supplied
            if probed_filename:
                headers.insert(n, 'Variant peptide')
                headers.insert(n, 'Reference peptide')
            tsvout.writerow(headers)
            continue

        # VR values first, then the VAD values whose column is not a repeat
        cells = list(row)
        cells.extend(value for i, value in enumerate(row_2)
                     if i not in duplicate_indices)

        # Verifies the peptides intersected previously through
        # sequences obtained from Ensembl's API
        if probed_filename:
            cells.insert(n, '')
            cells.insert(n, '')
            input_line = cells[headers.index('Input line')]
            if input_line in pep_map:
                pepseq = pep_map[input_line][0]
                aa_changes = pep_muts[pepseq]
                transcript_id = cells[headers.index('S.O. transcript')]
                ref_fullseq = getSequence(transcript_id)
                # Checks the other S.O. transcripts if the primary
                # S.O. transcript has no sequence available; if that
                # fails as well, resubmits them without their version
                # extension.
                if not ref_fullseq:
                    transcripts = cells[headers.index('S.O. all transcripts')]
                    for strip_version in (False, True):
                        for transcript in transcripts.split(','):
                            if not transcript:
                                continue
                            mat = SOtranscripts.search(transcript)
                            if mat is None:
                                # entry is not of the form "<id>:<change>"
                                continue
                            candidate_id = mat.group(1)
                            if strip_version:
                                candidate_id = candidate_id.split('.')[0]
                            ref_fullseq = getSequence(candidate_id)
                            if ref_fullseq:
                                # Keeps the peptide offset of the first
                                # recorded change but uses the change as
                                # described for this transcript.
                                # next(iter(...)) works on Python 2 and 3;
                                # dict.values() is not indexable on 3.
                                first_offsets = next(iter(aa_changes.values()))
                                aa_changes = {mat.group(2): [first_offsets[0]]}
                                break
                        if ref_fullseq:
                            break
                if ref_fullseq:
                    # Groups the amino acid changes by the protein
                    # position at which the peptide would start.
                    positions = {}
                    for aa_change in aa_changes:
                        m = reg_seq_change.search(aa_change)
                        if m is None:
                            # e.g. an empty protein sequence change
                            continue
                        aa_protpos = int(m.group(2))
                        aa_peppos = aa_changes[aa_change][0]
                        aa_startpos = aa_protpos - aa_peppos - 1
                        positions.setdefault(aa_startpos, []).append(aa_change)
                    # Goes through the sorted categories to mutate the Ensembl peptide
                    # (uses proBED peptide as a reference)
                    for pep_protpos in positions:
                        if pep_protpos < 0:
                            # a negative start would slice from the end of
                            # the protein sequence
                            continue
                        ref_seq = ref_fullseq[pep_protpos:pep_protpos + len(pepseq)]
                        mut_seq = ref_seq
                        for mut in positions[pep_protpos]:
                            m = reg_seq_change.search(mut)
                            ref_aa = m.group(1)
                            mut_pos = int(m.group(2))
                            alt_aa = m.group(3)
                            pep_mutpos = mut_pos - pep_protpos - 1
                            # An offset outside either sequence counts as a
                            # mismatch instead of raising IndexError or
                            # silently indexing from the end.
                            if not (0 <= pep_mutpos < len(ref_seq)
                                    and pep_mutpos < len(pepseq)):
                                break
                            observed_aa = pepseq[pep_mutpos]
                            if (ref_seq[pep_mutpos] != ref_aa
                                    or observed_aa not in (alt_aa, ref_aa)):
                                break
                            # Puts the residue the peptide actually carries
                            # (reference or alternate) at this offset.
                            mut_seq = (mut_seq[:pep_mutpos] + observed_aa
                                       + mut_seq[pep_mutpos + 1:])
                        # Adds the mutated peptide and reference peptide if mutated correctly
                        if pepseq == mut_seq:
                            cells[n + 1] = pepseq
                            cells[n] = ref_seq
        tsvout.writerow(cells)
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
390