Mercurial > repos > galaxyp > cravatool
comparison cravatp_submit.py @ 1:2c7bcc1219fc draft
Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
author | galaxyp |
---|---|
date | Thu, 16 Aug 2018 12:27:35 -0400 |
parents | |
children | a018c44dc18b |
comparison
equal
deleted
inserted
replaced
0:83181dabeb90 | 1:2c7bcc1219fc |
---|---|
1 # -*- coding: utf-8 -*- | |
2 # | |
3 # Author: Ray W. Sajulga | |
4 # | |
5 # | |
6 | |
7 import requests # pipenv requests | |
8 import json | |
9 import time | |
10 import urllib | |
11 import sys | |
12 import csv | |
13 import re | |
14 import math | |
15 import argparse | |
16 from xml.etree import ElementTree as ET | |
17 from zipfile import ZipFile | |
18 try: #Python 3 | |
19 from urllib.request import urlopen | |
20 except ImportError: #Python 2 | |
21 from urllib2 import urlopen | |
22 from io import BytesIO | |
23 | |
24 # initializes blank parameters | |
25 chasm_classifier = '' | |
26 probed_filename = None | |
27 intersected_only = False | |
28 vcf_output = None | |
29 analysis_type = None | |
30 | |
31 # # Testing Command | |
32 # python cravatp_submit.py test-data/Freebayes_two-variants.vcf GRCh38 | |
33 # test-data/variant.tsv test-data/gene.tsv test-data/noncoding.tsv | |
34 # test-data/error.tsv CHASM -—classifier Breast -—proBED | |
35 # test-data/MCF7_proBed.bed | |
36 parser = argparse.ArgumentParser() | |
37 parser.add_argument('cravatInput',help='The filename of the input ' | |
38 'CRAVAT-formatted tabular file ' | |
39 '(e.g., VCF)') | |
40 parser.add_argument('GRCh', help='The name of the human reference ' | |
41 'genome used for annotation: ' | |
42 'GRCh38/hg38 or GRCh37/hg19') | |
43 parser.add_argument('variant', help='The filename of the output ' | |
44 'variant file') | |
45 parser.add_argument('gene', help='The filename of the output gene ' | |
46 'variant report') | |
47 parser.add_argument('noncoding', help='The filename of the output ' | |
48 'non-coding variant report') | |
49 parser.add_argument('error', help='The filename of the output error ' | |
50 'file') | |
51 parser.add_argument('analysis', help='The machine-learning algorithm ' | |
52 'used for CRAVAT annotation (VEST' | |
53 ' and/or CHASM)') | |
54 parser.add_argument('--classifier', help='The cancer classifier for the' | |
55 ' CHASM algorithm') | |
56 parser.add_argument('--proBED', help='The filename of the proBED file ' | |
57 'containing peptides with genomic ' | |
58 'coordinates') | |
59 parser.add_argument('--intersectOnly', help='Specifies whether to ' | |
60 'analyze only variants ' | |
61 'intersected between the ' | |
62 'CRAVAT input and proBED ' | |
63 'file') | |
64 parser.add_argument('--vcfOutput', help='The output filename of the ' | |
65 'intersected VCF file') | |
66 | |
67 # assigns parsed arguments to appropriate variables | |
68 args = parser.parse_args() | |
69 input_filename = args.cravatInput | |
70 GRCh_build = args.GRCh | |
71 output_filename = args.variant | |
72 file_3 = args.gene | |
73 file_4 = args.noncoding | |
74 file_5 = args.error | |
75 if args.analysis != 'None': | |
76 analysis_type = args.analysis | |
77 if args.classifier: | |
78 chasm_classifier = args.classifier | |
79 if args.proBED: | |
80 probed_filename = args.proBED | |
81 if args.intersectOnly: | |
82 intersected_only = args.intersectOnly | |
83 if args.vcfOutput: | |
84 vcf_output = args.vcfOutput | |
85 | |
86 if analysis_type and '+' in analysis_type: | |
87 analysis_type = 'CHASM;VEST' | |
88 | |
89 # obtains the transcript's protein sequence using Ensembl API | |
90 def getSequence(transcript_id): | |
91 server = 'http://rest.ensembl.org' | |
92 ext = ('/sequence/id/' + transcript_id | |
93 + '?content-type=text/x-seqxml%2Bxml;' | |
94 'multiple_sequences=1;type=protein') | |
95 req = requests.get(server+ext, | |
96 headers={ "Content-Type" : "text/plain"}) | |
97 | |
98 if not req.ok: | |
99 return None | |
100 | |
101 root = ET.fromstring(req.content) | |
102 for child in root.iter('AAseq'): | |
103 return child.text | |
104 | |
105 # parses the proBED file as a list. | |
106 def loadProBED(): | |
107 proBED = [] | |
108 with open(probed_filename) as tsvin: | |
109 tsvreader = csv.reader(tsvin, delimiter='\t') | |
110 for i, row in enumerate(tsvreader): | |
111 proBED.append(row) | |
112 return proBED | |
113 | |
114 write_header = True | |
115 | |
116 | |
117 # Creates an VCF file that only contains variants that overlap with the | |
118 # proteogenomic input (proBED) file if the user specifies that they want | |
119 # to only include intersected variants or if they want to receive the | |
120 # intersected VCF as well. | |
121 if probed_filename and (vcf_output or intersected_only == 'true'): | |
122 proBED = loadProBED() | |
123 if not vcf_output: | |
124 vcf_output = 'intersected_input.vcf' | |
125 with open(input_filename) as tsvin, open(vcf_output, 'wb') as tsvout: | |
126 tsvreader = csv.reader(tsvin, delimiter='\t') | |
127 tsvout = csv.writer(tsvout, delimiter='\t', escapechar=' ', | |
128 quoting=csv.QUOTE_NONE) | |
129 | |
130 for row in tsvreader: | |
131 if row == [] or row[0][0] == '#': | |
132 tsvout.writerow(row) | |
133 else: | |
134 genchrom = row[0] | |
135 genpos = int(row[1]) | |
136 | |
137 for peptide in proBED: | |
138 pepchrom = peptide[0] | |
139 pepposA = int(peptide[1]) | |
140 pepposB = int(peptide[2]) | |
141 if (genchrom == pepchrom and | |
142 pepposA <= genpos and | |
143 genpos <= pepposB): | |
144 tsvout.writerow(row) | |
145 break | |
146 if intersected_only == 'true': | |
147 input_filename = vcf_output | |
148 | |
149 # sets up the parameters for submission to the CRAVAT API | |
150 parameters = {'email':'rsajulga@umn.edu', | |
151 'hg19': 'on' if GRCh_build == 'GRCh37' else 'off', | |
152 'functionalannotation': 'on', | |
153 'tsvreport' : 'on', | |
154 'mupitinput' : 'on'} | |
155 if analysis_type: | |
156 parameters['analyses'] = analysis_type | |
157 if chasm_classifier: | |
158 parameters['chasmclassifier'] = chasm_classifier | |
159 | |
160 # plugs in params to given URL | |
161 submit = requests.post('http://www.cravat.us/CRAVAT/rest/service/submit', | |
162 files = {'inputfile':open(input_filename)}, | |
163 data = parameters) | |
164 | |
165 # makes the data a json dictionary; takes out only the job ID | |
166 jobid = json.loads(submit.text)['jobid'] | |
167 | |
168 # loops until we find a status equal to Success, then breaks | |
169 while True: | |
170 check = requests.get( | |
171 'http://www.cravat.us/CRAVAT/rest/service/status', | |
172 params = {'jobid' : jobid}) | |
173 status = json.loads(check.text)['status'] | |
174 resultfileurl = json.loads(check.text)['resultfileurl'] | |
175 #out_file.write(str(status) + ', ') | |
176 if status == 'Success': | |
177 #out_file.write('\t' + resultfileurl) | |
178 break | |
179 else: | |
180 time.sleep(2) | |
181 | |
182 # obtains the zipfile created by CRAVAT and loads the variants and VAD | |
183 # file for processing | |
184 r = requests.get(resultfileurl, stream=True) | |
185 url = urlopen(resultfileurl) | |
186 zipfile = ZipFile(BytesIO(r.content)) | |
187 variants = zipfile.open(jobid + '/Variant.Result.tsv').readlines() | |
188 vad = zipfile.open(jobid + '/Variant_Additional_Details.Result.tsv').readlines() | |
189 | |
190 # reads and writes the gene, noncoding, and error files | |
191 open(file_3, 'wb').write(zipfile.read(jobid + '/Gene_Level_Analysis.Result.tsv')) | |
192 open(file_4, 'wb').write(zipfile.read(jobid + '/Variant_Non-coding.Result.tsv')) | |
193 open(file_5, 'wb').write(zipfile.read(jobid + '/Input_Errors.Result.tsv')) | |
194 | |
195 | |
196 | |
197 if probed_filename and not vcf_output: | |
198 proBED = loadProBED() | |
199 | |
200 if probed_filename: | |
201 with open(output_filename, 'w') as tsvout: | |
202 tsvout = csv.writer(tsvout, | |
203 delimiter='\t', | |
204 escapechar=' ', | |
205 quoting=csv.QUOTE_NONE) | |
206 n = 11 #Index for proteogenomic column start | |
207 reg_seq_change = re.compile('([A-Z]+)(\d+)([A-Z]+)') | |
208 SOtranscripts = re.compile('([A-Z]+[\d\.]+):([A-Z]+\d+[A-Z]+)') | |
209 pep_muts = {} | |
210 pep_map = {} | |
211 rows = [] | |
212 for row in vad: | |
213 row = row.decode().split('\t') | |
214 row[-1] = row[-1].replace('\n','') | |
215 if row and row[0] and not row[0].startswith('#'): | |
216 # checks if the row begins with input line | |
217 if row[0].startswith('Input line'): | |
218 vad_headers = row | |
219 | |
220 else: | |
221 # Initially screens through the output Variant | |
222 # Additional Details to catch mutations on | |
223 # same peptide region | |
224 genchrom = row[vad_headers.index('Chromosome')] | |
225 genpos = int(row[vad_headers.index('Position')]) | |
226 aa_change = row[vad_headers.index('Protein sequence change')] | |
227 input_line = row[vad_headers.index('Input line')] | |
228 | |
229 for peptide in proBED: | |
230 pepseq = peptide[3] | |
231 pepchrom = peptide[0] | |
232 pepposA = int(peptide[1]) | |
233 pepposB = int(peptide[2]) | |
234 if genchrom == pepchrom and pepposA <= genpos and genpos <= pepposB: | |
235 strand = row[vad_headers.index('Strand')] | |
236 transcript_strand = row[vad_headers.index('S.O. transcript strand')] | |
237 | |
238 # Calculates the position of the variant | |
239 # amino acid(s) on peptide | |
240 if transcript_strand == strand: | |
241 aa_peppos = int(math.ceil((genpos - pepposA)/3.0) - 1) | |
242 if (strand == '-' or transcript_strand == '-' | |
243 or aa_peppos >= len(pepseq)): | |
244 aa_peppos = int(math.floor((pepposB - genpos)/3.0)) | |
245 if pepseq in pep_muts: | |
246 if aa_change not in pep_muts[pepseq]: | |
247 pep_muts[pepseq][aa_change] = [aa_peppos] | |
248 else: | |
249 if aa_peppos not in pep_muts[pepseq][aa_change]: | |
250 pep_muts[pepseq][aa_change].append(aa_peppos) | |
251 else: | |
252 pep_muts[pepseq] = {aa_change : [aa_peppos]} | |
253 # Stores the intersect information by mapping | |
254 # Input Line (CRAVAT output) to peptide sequence. | |
255 if input_line in pep_map: | |
256 if pepseq not in pep_map[input_line]: | |
257 pep_map[input_line].append(pepseq) | |
258 else: | |
259 pep_map[input_line] = [pepseq] | |
260 # TODO: Need to obtain strand information as | |
261 # well i.e., positive (+) or negative (-) | |
262 | |
263 | |
264 with open(output_filename, 'w') as tsvout: | |
265 tsvout = csv.writer(tsvout, delimiter='\t', escapechar='', | |
266 quoting=csv.QUOTE_NONE) | |
267 headers = [] | |
268 duplicate_indices = [] | |
269 | |
270 # loops through each row in the Variant Additional Details (VAD) file | |
271 for x, row in enumerate(variants): | |
272 row = row.decode().split('\t') | |
273 row[-1] = row[-1].replace('\n','') | |
274 # sets row_2 equal to the same row in Variant Result (VR) file | |
275 row_2 = vad[x].decode().split('\t') | |
276 row_2[-1] = row_2[-1].replace('\n','') | |
277 | |
278 # checks if row is empty or if the first term contains '#' | |
279 if not row or not row[0] or row[0].startswith('#'): | |
280 if row[0]: | |
281 tsvout.writerow(row) | |
282 else: | |
283 if row[0].startswith('Input line'): | |
284 # goes through each value in the headers list in VAD | |
285 headers = row | |
286 # loops through the Keys in VR | |
287 for i,value in enumerate(row_2): | |
288 #Checks if the value is already in headers | |
289 if value in headers: | |
290 duplicate_indices.append(i) | |
291 continue | |
292 #else adds the header to headers | |
293 else: | |
294 headers.append(value) | |
295 # adds appropriate headers when proteomic input is supplied | |
296 if probed_filename: | |
297 headers.insert(n, 'Variant peptide') | |
298 headers.insert(n, 'Reference peptide') | |
299 tsvout.writerow(headers) | |
300 else: | |
301 cells = [] | |
302 # goes through each value in the next list | |
303 for value in row: | |
304 #adds it to cells | |
305 cells.append(value) | |
306 # goes through each value from the VR file after position | |
307 # 11 (After it is done repeating from VAD file) | |
308 for i,value in enumerate(row_2): | |
309 # adds in the rest of the values to cells | |
310 if i not in duplicate_indices: | |
311 # Skips the initial 11 columns and the | |
312 # VEST p-value (already in VR file) | |
313 cells.append(value) | |
314 | |
315 # Verifies the peptides intersected previously through | |
316 # sequences obtained from Ensembl's API | |
317 if probed_filename: | |
318 cells.insert(n,'') | |
319 cells.insert(n,'') | |
320 input_line = cells[headers.index('Input line')] | |
321 if input_line in pep_map: | |
322 pepseq = pep_map[input_line][0] | |
323 aa_changes = pep_muts[pepseq] | |
324 transcript_id = cells[headers.index('S.O. transcript')] | |
325 ref_fullseq = getSequence(transcript_id) | |
326 # Checks the other S.O. transcripts if the primary | |
327 # S.O. transcript has no sequence available | |
328 if not ref_fullseq: | |
329 transcripts = cells[headers.index('S.O. all transcripts')] | |
330 for transcript in transcripts.split(','): | |
331 if transcript: | |
332 mat = SOtranscripts.search(transcript) | |
333 ref_fullseq = getSequence(mat.group(1)) | |
334 if ref_fullseq: | |
335 aa_changes = {mat.group(2): [aa_changes.values()[0][0]]} | |
336 break | |
337 # Resubmits the previous transcripts without | |
338 # extensions if all S.O. transcripts fail to | |
339 # provide a sequence | |
340 if not ref_fullseq: | |
341 transcripts = cells[headers.index('S.O. all transcripts')] | |
342 for transcript in transcripts.split(','): | |
343 if transcript: | |
344 mat = SOtranscripts.search(transcript) | |
345 ref_fullseq = getSequence(mat.group(1).split('.')[0]) | |
346 if ref_fullseq: | |
347 aa_changes = {mat.group(2): [aa_changes.values()[0][0]]} | |
348 break | |
349 if ref_fullseq: | |
350 # Sorts the amino acid changes | |
351 positions = {} | |
352 for aa_change in aa_changes: | |
353 m = reg_seq_change.search(aa_change) | |
354 aa_protpos = int(m.group(2)) | |
355 aa_peppos = aa_changes[aa_change][0] | |
356 aa_startpos = aa_protpos - aa_peppos - 1 | |
357 if aa_startpos in positions: | |
358 positions[aa_startpos].append(aa_change) | |
359 else: | |
360 positions[aa_startpos] = [aa_change] | |
361 # Goes through the sorted categories to mutate the Ensembl peptide | |
362 # (uses proBED peptide as a reference) | |
363 for pep_protpos in positions: | |
364 ref_seq = ref_fullseq[pep_protpos:pep_protpos+len(pepseq)] | |
365 muts = positions[pep_protpos] | |
366 options = [] | |
367 mut_seq = ref_seq | |
368 for mut in muts: | |
369 m = reg_seq_change.search(mut) | |
370 ref_aa = m.group(1) | |
371 mut_pos = int(m.group(2)) | |
372 alt_aa = m.group(3) | |
373 pep_mutpos = mut_pos - pep_protpos - 1 | |
374 if (ref_seq[pep_mutpos] == ref_aa | |
375 and (pepseq[pep_mutpos] == alt_aa | |
376 or pepseq[pep_mutpos] == ref_aa)): | |
377 if pepseq[pep_mutpos] == ref_aa: | |
378 mut_seq = (mut_seq[:pep_mutpos] + ref_aa | |
379 + mut_seq[pep_mutpos+1:]) | |
380 else: | |
381 mut_seq = (mut_seq[:pep_mutpos] + alt_aa | |
382 + mut_seq[pep_mutpos+1:]) | |
383 else: | |
384 break | |
385 # Adds the mutated peptide and reference peptide if mutated correctly | |
386 if pepseq == mut_seq: | |
387 cells[n+1] = pepseq | |
388 cells[n] = ref_seq | |
389 tsvout.writerow(cells) | |
390 |