comparison cravatp_submit.py @ 1:2c7bcc1219fc draft

Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
author galaxyp
date Thu, 16 Aug 2018 12:27:35 -0400
parents
children a018c44dc18b
comparison
equal deleted inserted replaced
0:83181dabeb90 1:2c7bcc1219fc
1 # -*- coding: utf-8 -*-
2 #
3 # Author: Ray W. Sajulga
4 #
5 #
6
7 import requests # pipenv requests
8 import json
9 import time
10 import urllib
11 import sys
12 import csv
13 import re
14 import math
15 import argparse
16 from xml.etree import ElementTree as ET
17 from zipfile import ZipFile
18 try: #Python 3
19 from urllib.request import urlopen
20 except ImportError: #Python 2
21 from urllib2 import urlopen
22 from io import BytesIO
23
24 # initializes blank parameters
25 chasm_classifier = ''
26 probed_filename = None
27 intersected_only = False
28 vcf_output = None
29 analysis_type = None
30
31 # # Testing Command
32 # python cravatp_submit.py test-data/Freebayes_two-variants.vcf GRCh38
33 # test-data/variant.tsv test-data/gene.tsv test-data/noncoding.tsv
34 # test-data/error.tsv CHASM -—classifier Breast -—proBED
35 # test-data/MCF7_proBed.bed
36 parser = argparse.ArgumentParser()
37 parser.add_argument('cravatInput',help='The filename of the input '
38 'CRAVAT-formatted tabular file '
39 '(e.g., VCF)')
40 parser.add_argument('GRCh', help='The name of the human reference '
41 'genome used for annotation: '
42 'GRCh38/hg38 or GRCh37/hg19')
43 parser.add_argument('variant', help='The filename of the output '
44 'variant file')
45 parser.add_argument('gene', help='The filename of the output gene '
46 'variant report')
47 parser.add_argument('noncoding', help='The filename of the output '
48 'non-coding variant report')
49 parser.add_argument('error', help='The filename of the output error '
50 'file')
51 parser.add_argument('analysis', help='The machine-learning algorithm '
52 'used for CRAVAT annotation (VEST'
53 ' and/or CHASM)')
54 parser.add_argument('--classifier', help='The cancer classifier for the'
55 ' CHASM algorithm')
56 parser.add_argument('--proBED', help='The filename of the proBED file '
57 'containing peptides with genomic '
58 'coordinates')
59 parser.add_argument('--intersectOnly', help='Specifies whether to '
60 'analyze only variants '
61 'intersected between the '
62 'CRAVAT input and proBED '
63 'file')
64 parser.add_argument('--vcfOutput', help='The output filename of the '
65 'intersected VCF file')
66
67 # assigns parsed arguments to appropriate variables
68 args = parser.parse_args()
69 input_filename = args.cravatInput
70 GRCh_build = args.GRCh
71 output_filename = args.variant
72 file_3 = args.gene
73 file_4 = args.noncoding
74 file_5 = args.error
75 if args.analysis != 'None':
76 analysis_type = args.analysis
77 if args.classifier:
78 chasm_classifier = args.classifier
79 if args.proBED:
80 probed_filename = args.proBED
81 if args.intersectOnly:
82 intersected_only = args.intersectOnly
83 if args.vcfOutput:
84 vcf_output = args.vcfOutput
85
86 if analysis_type and '+' in analysis_type:
87 analysis_type = 'CHASM;VEST'
88
89 # obtains the transcript's protein sequence using Ensembl API
90 def getSequence(transcript_id):
91 server = 'http://rest.ensembl.org'
92 ext = ('/sequence/id/' + transcript_id
93 + '?content-type=text/x-seqxml%2Bxml;'
94 'multiple_sequences=1;type=protein')
95 req = requests.get(server+ext,
96 headers={ "Content-Type" : "text/plain"})
97
98 if not req.ok:
99 return None
100
101 root = ET.fromstring(req.content)
102 for child in root.iter('AAseq'):
103 return child.text
104
105 # parses the proBED file as a list.
106 def loadProBED():
107 proBED = []
108 with open(probed_filename) as tsvin:
109 tsvreader = csv.reader(tsvin, delimiter='\t')
110 for i, row in enumerate(tsvreader):
111 proBED.append(row)
112 return proBED
113
114 write_header = True
115
116
117 # Creates an VCF file that only contains variants that overlap with the
118 # proteogenomic input (proBED) file if the user specifies that they want
119 # to only include intersected variants or if they want to receive the
120 # intersected VCF as well.
121 if probed_filename and (vcf_output or intersected_only == 'true'):
122 proBED = loadProBED()
123 if not vcf_output:
124 vcf_output = 'intersected_input.vcf'
125 with open(input_filename) as tsvin, open(vcf_output, 'wb') as tsvout:
126 tsvreader = csv.reader(tsvin, delimiter='\t')
127 tsvout = csv.writer(tsvout, delimiter='\t', escapechar=' ',
128 quoting=csv.QUOTE_NONE)
129
130 for row in tsvreader:
131 if row == [] or row[0][0] == '#':
132 tsvout.writerow(row)
133 else:
134 genchrom = row[0]
135 genpos = int(row[1])
136
137 for peptide in proBED:
138 pepchrom = peptide[0]
139 pepposA = int(peptide[1])
140 pepposB = int(peptide[2])
141 if (genchrom == pepchrom and
142 pepposA <= genpos and
143 genpos <= pepposB):
144 tsvout.writerow(row)
145 break
146 if intersected_only == 'true':
147 input_filename = vcf_output
148
149 # sets up the parameters for submission to the CRAVAT API
150 parameters = {'email':'rsajulga@umn.edu',
151 'hg19': 'on' if GRCh_build == 'GRCh37' else 'off',
152 'functionalannotation': 'on',
153 'tsvreport' : 'on',
154 'mupitinput' : 'on'}
155 if analysis_type:
156 parameters['analyses'] = analysis_type
157 if chasm_classifier:
158 parameters['chasmclassifier'] = chasm_classifier
159
160 # plugs in params to given URL
161 submit = requests.post('http://www.cravat.us/CRAVAT/rest/service/submit',
162 files = {'inputfile':open(input_filename)},
163 data = parameters)
164
165 # makes the data a json dictionary; takes out only the job ID
166 jobid = json.loads(submit.text)['jobid']
167
168 # loops until we find a status equal to Success, then breaks
169 while True:
170 check = requests.get(
171 'http://www.cravat.us/CRAVAT/rest/service/status',
172 params = {'jobid' : jobid})
173 status = json.loads(check.text)['status']
174 resultfileurl = json.loads(check.text)['resultfileurl']
175 #out_file.write(str(status) + ', ')
176 if status == 'Success':
177 #out_file.write('\t' + resultfileurl)
178 break
179 else:
180 time.sleep(2)
181
182 # obtains the zipfile created by CRAVAT and loads the variants and VAD
183 # file for processing
184 r = requests.get(resultfileurl, stream=True)
185 url = urlopen(resultfileurl)
186 zipfile = ZipFile(BytesIO(r.content))
187 variants = zipfile.open(jobid + '/Variant.Result.tsv').readlines()
188 vad = zipfile.open(jobid + '/Variant_Additional_Details.Result.tsv').readlines()
189
190 # reads and writes the gene, noncoding, and error files
191 open(file_3, 'wb').write(zipfile.read(jobid + '/Gene_Level_Analysis.Result.tsv'))
192 open(file_4, 'wb').write(zipfile.read(jobid + '/Variant_Non-coding.Result.tsv'))
193 open(file_5, 'wb').write(zipfile.read(jobid + '/Input_Errors.Result.tsv'))
194
195
196
197 if probed_filename and not vcf_output:
198 proBED = loadProBED()
199
200 if probed_filename:
201 with open(output_filename, 'w') as tsvout:
202 tsvout = csv.writer(tsvout,
203 delimiter='\t',
204 escapechar=' ',
205 quoting=csv.QUOTE_NONE)
206 n = 11 #Index for proteogenomic column start
207 reg_seq_change = re.compile('([A-Z]+)(\d+)([A-Z]+)')
208 SOtranscripts = re.compile('([A-Z]+[\d\.]+):([A-Z]+\d+[A-Z]+)')
209 pep_muts = {}
210 pep_map = {}
211 rows = []
212 for row in vad:
213 row = row.decode().split('\t')
214 row[-1] = row[-1].replace('\n','')
215 if row and row[0] and not row[0].startswith('#'):
216 # checks if the row begins with input line
217 if row[0].startswith('Input line'):
218 vad_headers = row
219
220 else:
221 # Initially screens through the output Variant
222 # Additional Details to catch mutations on
223 # same peptide region
224 genchrom = row[vad_headers.index('Chromosome')]
225 genpos = int(row[vad_headers.index('Position')])
226 aa_change = row[vad_headers.index('Protein sequence change')]
227 input_line = row[vad_headers.index('Input line')]
228
229 for peptide in proBED:
230 pepseq = peptide[3]
231 pepchrom = peptide[0]
232 pepposA = int(peptide[1])
233 pepposB = int(peptide[2])
234 if genchrom == pepchrom and pepposA <= genpos and genpos <= pepposB:
235 strand = row[vad_headers.index('Strand')]
236 transcript_strand = row[vad_headers.index('S.O. transcript strand')]
237
238 # Calculates the position of the variant
239 # amino acid(s) on peptide
240 if transcript_strand == strand:
241 aa_peppos = int(math.ceil((genpos - pepposA)/3.0) - 1)
242 if (strand == '-' or transcript_strand == '-'
243 or aa_peppos >= len(pepseq)):
244 aa_peppos = int(math.floor((pepposB - genpos)/3.0))
245 if pepseq in pep_muts:
246 if aa_change not in pep_muts[pepseq]:
247 pep_muts[pepseq][aa_change] = [aa_peppos]
248 else:
249 if aa_peppos not in pep_muts[pepseq][aa_change]:
250 pep_muts[pepseq][aa_change].append(aa_peppos)
251 else:
252 pep_muts[pepseq] = {aa_change : [aa_peppos]}
253 # Stores the intersect information by mapping
254 # Input Line (CRAVAT output) to peptide sequence.
255 if input_line in pep_map:
256 if pepseq not in pep_map[input_line]:
257 pep_map[input_line].append(pepseq)
258 else:
259 pep_map[input_line] = [pepseq]
260 # TODO: Need to obtain strand information as
261 # well i.e., positive (+) or negative (-)
262
263
264 with open(output_filename, 'w') as tsvout:
265 tsvout = csv.writer(tsvout, delimiter='\t', escapechar='',
266 quoting=csv.QUOTE_NONE)
267 headers = []
268 duplicate_indices = []
269
270 # loops through each row in the Variant Additional Details (VAD) file
271 for x, row in enumerate(variants):
272 row = row.decode().split('\t')
273 row[-1] = row[-1].replace('\n','')
274 # sets row_2 equal to the same row in Variant Result (VR) file
275 row_2 = vad[x].decode().split('\t')
276 row_2[-1] = row_2[-1].replace('\n','')
277
278 # checks if row is empty or if the first term contains '#'
279 if not row or not row[0] or row[0].startswith('#'):
280 if row[0]:
281 tsvout.writerow(row)
282 else:
283 if row[0].startswith('Input line'):
284 # goes through each value in the headers list in VAD
285 headers = row
286 # loops through the Keys in VR
287 for i,value in enumerate(row_2):
288 #Checks if the value is already in headers
289 if value in headers:
290 duplicate_indices.append(i)
291 continue
292 #else adds the header to headers
293 else:
294 headers.append(value)
295 # adds appropriate headers when proteomic input is supplied
296 if probed_filename:
297 headers.insert(n, 'Variant peptide')
298 headers.insert(n, 'Reference peptide')
299 tsvout.writerow(headers)
300 else:
301 cells = []
302 # goes through each value in the next list
303 for value in row:
304 #adds it to cells
305 cells.append(value)
306 # goes through each value from the VR file after position
307 # 11 (After it is done repeating from VAD file)
308 for i,value in enumerate(row_2):
309 # adds in the rest of the values to cells
310 if i not in duplicate_indices:
311 # Skips the initial 11 columns and the
312 # VEST p-value (already in VR file)
313 cells.append(value)
314
315 # Verifies the peptides intersected previously through
316 # sequences obtained from Ensembl's API
317 if probed_filename:
318 cells.insert(n,'')
319 cells.insert(n,'')
320 input_line = cells[headers.index('Input line')]
321 if input_line in pep_map:
322 pepseq = pep_map[input_line][0]
323 aa_changes = pep_muts[pepseq]
324 transcript_id = cells[headers.index('S.O. transcript')]
325 ref_fullseq = getSequence(transcript_id)
326 # Checks the other S.O. transcripts if the primary
327 # S.O. transcript has no sequence available
328 if not ref_fullseq:
329 transcripts = cells[headers.index('S.O. all transcripts')]
330 for transcript in transcripts.split(','):
331 if transcript:
332 mat = SOtranscripts.search(transcript)
333 ref_fullseq = getSequence(mat.group(1))
334 if ref_fullseq:
335 aa_changes = {mat.group(2): [aa_changes.values()[0][0]]}
336 break
337 # Resubmits the previous transcripts without
338 # extensions if all S.O. transcripts fail to
339 # provide a sequence
340 if not ref_fullseq:
341 transcripts = cells[headers.index('S.O. all transcripts')]
342 for transcript in transcripts.split(','):
343 if transcript:
344 mat = SOtranscripts.search(transcript)
345 ref_fullseq = getSequence(mat.group(1).split('.')[0])
346 if ref_fullseq:
347 aa_changes = {mat.group(2): [aa_changes.values()[0][0]]}
348 break
349 if ref_fullseq:
350 # Sorts the amino acid changes
351 positions = {}
352 for aa_change in aa_changes:
353 m = reg_seq_change.search(aa_change)
354 aa_protpos = int(m.group(2))
355 aa_peppos = aa_changes[aa_change][0]
356 aa_startpos = aa_protpos - aa_peppos - 1
357 if aa_startpos in positions:
358 positions[aa_startpos].append(aa_change)
359 else:
360 positions[aa_startpos] = [aa_change]
361 # Goes through the sorted categories to mutate the Ensembl peptide
362 # (uses proBED peptide as a reference)
363 for pep_protpos in positions:
364 ref_seq = ref_fullseq[pep_protpos:pep_protpos+len(pepseq)]
365 muts = positions[pep_protpos]
366 options = []
367 mut_seq = ref_seq
368 for mut in muts:
369 m = reg_seq_change.search(mut)
370 ref_aa = m.group(1)
371 mut_pos = int(m.group(2))
372 alt_aa = m.group(3)
373 pep_mutpos = mut_pos - pep_protpos - 1
374 if (ref_seq[pep_mutpos] == ref_aa
375 and (pepseq[pep_mutpos] == alt_aa
376 or pepseq[pep_mutpos] == ref_aa)):
377 if pepseq[pep_mutpos] == ref_aa:
378 mut_seq = (mut_seq[:pep_mutpos] + ref_aa
379 + mut_seq[pep_mutpos+1:])
380 else:
381 mut_seq = (mut_seq[:pep_mutpos] + alt_aa
382 + mut_seq[pep_mutpos+1:])
383 else:
384 break
385 # Adds the mutated peptide and reference peptide if mutated correctly
386 if pepseq == mut_seq:
387 cells[n+1] = pepseq
388 cells[n] = ref_seq
389 tsvout.writerow(cells)
390