Mercurial > repos > galaxyp > cravatool
comparison cravat_submit.py @ 0:83181dabeb90 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/cravatool commit 4f73619e5f750916a9971e433ddd6b8dee0d7dd3
author | galaxyp |
---|---|
date | Fri, 18 May 2018 13:25:29 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:83181dabeb90 |
---|---|
1 import requests | |
2 import json | |
3 import time | |
4 import urllib | |
5 import sys | |
6 import csv | |
7 import re | |
8 import math | |
9 from difflib import SequenceMatcher | |
10 from xml.etree import ElementTree as ET | |
11 import sqlite3 | |
12 | |
13 try: | |
14 input_filename = sys.argv[1] | |
15 input_select_bar = sys.argv[2] | |
16 GRCh_build = sys.argv[3] | |
17 probed_filename = sys.argv[4] | |
18 output_filename = sys.argv[5] | |
19 file_3 = sys.argv[6] | |
20 file_4 = sys.argv[7] | |
21 file_5 = sys.argv[8] | |
22 except: | |
23 # Filenames for testing. | |
24 input_filename = 'test-data/[VCF-BEDintersect__on_data_65_and_data_6].vcf' | |
25 probed_filename = 'test-data/[PepPointer].bed' | |
26 input_select_bar = 'VEST' | |
27 GRCh_build = 'GRCh38' | |
28 output_filename = 'combined_variants.tsv' | |
29 file_3 = 'test-results/Gene_Level_Analysis.tsv' | |
30 file_4 = 'test-results/Variant_Non-coding.Result.tsv' | |
31 file_5 = 'test-results/Input_Errors.Result.tsv' | |
32 matches_filename = 'matches.tsv' | |
33 | |
34 def getSequence(transcript_id): | |
35 server = 'http://rest.ensembl.org' | |
36 ext = '/sequence/id/' + transcript_id + '?content-type=text/x-seqxml%2Bxml;multiple_sequences=1;type=protein' | |
37 req = requests.get(server+ext, headers={ "Content-Type" : "text/plain"}) | |
38 | |
39 if not req.ok: | |
40 return None | |
41 | |
42 root = ET.fromstring(req.content) | |
43 for child in root.iter('AAseq'): | |
44 return child.text | |
45 | |
46 | |
47 write_header = True | |
48 | |
49 GRCh37hg19 = 'off' | |
50 if GRCh_build == 'GRCh37': | |
51 GRCh37hg19 = 'on' | |
52 | |
53 #plugs in params to given URL | |
54 submit = requests.post('http://staging.cravat.us/CRAVAT/rest/service/submit', files={'inputfile':open(input_filename)}, data={'email':'znylund@insilico.us.com', 'analyses': input_select_bar, 'hg19': GRCh37hg19}) | |
55 | |
56 #Makes the data a json dictionary, takes out only the job ID | |
57 jobid = json.loads(submit.text)['jobid'] | |
58 | |
59 #out_file.write(jobid) | |
60 submitted = json.loads(submit.text)['status'] | |
61 #out_file.write('\t' + submitted) | |
62 | |
63 input_file = open(input_filename) | |
64 | |
65 # Loads the proBED file as a list. | |
66 if (probed_filename != 'None'): | |
67 proBED = [] | |
68 with open(probed_filename) as tsvin: | |
69 tsvreader = csv.reader(tsvin, delimiter='\t') | |
70 for i, row in enumerate(tsvreader): | |
71 proBED.append(row) | |
72 | |
73 #loops until we find a status equal to Success, then breaks | |
74 while True: | |
75 check = requests.get('http://staging.cravat.us/CRAVAT/rest/service/status', params={'jobid': jobid}) | |
76 status = json.loads(check.text)['status'] | |
77 resultfileurl = json.loads(check.text)['resultfileurl'] | |
78 #out_file.write(str(status) + ', ') | |
79 if status == 'Success': | |
80 #out_file.write('\t' + resultfileurl) | |
81 break | |
82 else: | |
83 time.sleep(2) | |
84 | |
85 #out_file.write('\n') | |
86 | |
87 #creates three files | |
88 file_1 = 'Variant_Result.tsv' | |
89 file_2 = 'Additional_Details.tsv' | |
90 #file_3 = time.strftime("%H:%M") + 'Combined_Variant_Results.tsv' | |
91 | |
92 #Downloads the tabular results | |
93 urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Variant.Result.tsv", file_1) | |
94 urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Variant_Additional_Details.Result.tsv", file_2) | |
95 urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Gene_Level_Analysis.Result.tsv", file_3) | |
96 urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Variant_Non-coding.Result.tsv", file_4) | |
97 urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Input_Errors.Result.tsv", file_5) | |
98 | |
99 #opens the Variant Result file and the Variant Additional Details file as csv readers, then opens the output file (galaxy) as a writer | |
100 with open(file_1) as tsvin_1, open(file_2) as tsvin_2, open(output_filename, 'wb') as tsvout: | |
101 tsvreader_2 = csv.reader(tsvin_2, delimiter='\t') | |
102 tsvout = csv.writer(tsvout, delimiter='\t') | |
103 | |
104 headers = [] | |
105 duplicate_indices = [] | |
106 n = 12 #Index for proteogenomic column start | |
107 reg_seq_change = re.compile('([A-Z]+)(\d+)([A-Z]+)') | |
108 SOtranscripts = re.compile('([A-Z]+[\d\.]+):([A-Z]+\d+[A-Z]+)') | |
109 pep_muts = {} | |
110 pep_map = {} | |
111 rows = [] | |
112 | |
113 for row in tsvreader_2: | |
114 if row != [] and row[0][0] != '#': | |
115 #checks if the row begins with input line | |
116 if row[0] == 'Input line': | |
117 vad_headers = row | |
118 else: | |
119 # Initially screens through the output Variant Additional Details to catch mutations on same peptide region | |
120 genchrom = row[vad_headers.index('Chromosome')] | |
121 genpos = int(row[vad_headers.index('Position')]) | |
122 aa_change = row[vad_headers.index('Protein sequence change')] | |
123 input_line = row[vad_headers.index('Input line')] | |
124 | |
125 for peptide in proBED: | |
126 pepseq = peptide[3] | |
127 pepchrom = peptide[0] | |
128 pepposA = int(peptide[1]) | |
129 pepposB = int(peptide[2]) | |
130 if genchrom == pepchrom and pepposA <= genpos and genpos <= pepposB: | |
131 strand = row[vad_headers.index('Strand')] | |
132 transcript_strand = row[vad_headers.index('S.O. transcript strand')] | |
133 | |
134 # Calculates the position of the variant amino acid(s) on peptide | |
135 if transcript_strand == strand: | |
136 aa_peppos = int(math.ceil((genpos - pepposA)/3.0) - 1) | |
137 if strand == '-' or transcript_strand == '-' or aa_peppos >= len(pepseq): | |
138 aa_peppos = int(math.floor((pepposB - genpos)/3.0)) | |
139 if pepseq in pep_muts: | |
140 if aa_change not in pep_muts[pepseq]: | |
141 pep_muts[pepseq][aa_change] = [aa_peppos] | |
142 else: | |
143 if aa_peppos not in pep_muts[pepseq][aa_change]: | |
144 pep_muts[pepseq][aa_change].append(aa_peppos) | |
145 else: | |
146 pep_muts[pepseq] = {aa_change : [aa_peppos]} | |
147 # Stores the intersect information by mapping Input Line (CRAVAT output) to peptide sequence. | |
148 if input_line in pep_map: | |
149 if pepseq not in pep_map[input_line]: | |
150 pep_map[input_line].append(pepseq) | |
151 else: | |
152 pep_map[input_line] = [pepseq] | |
153 | |
154 with open(file_1) as tsvin_1, open(file_2) as tsvin_2, open(output_filename, 'wb') as tsvout: | |
155 tsvreader_1 = csv.reader(tsvin_1, delimiter='\t') | |
156 tsvreader_2 = csv.reader(tsvin_2, delimiter='\t') | |
157 tsvout = csv.writer(tsvout, delimiter='\t') | |
158 | |
159 headers = [] | |
160 | |
161 #loops through each row in the Variant Additional Details (VAD) file | |
162 for row in tsvreader_2: | |
163 | |
164 #sets row_2 equal to the same row in Variant Result (VR) file | |
165 row_2 = tsvreader_1.next() | |
166 #checks if row is empty or if the first term contains '#' | |
167 if row == [] or row[0][0] == '#': | |
168 tsvout.writerow(row) | |
169 else: | |
170 if row[0] == 'Input line': | |
171 #Goes through each value in the headers list in VAD | |
172 for value in row: | |
173 #Adds each value into headers | |
174 headers.append(value) | |
175 #Loops through the Keys in VR | |
176 for i,value in enumerate(row_2): | |
177 #Checks if the value is already in headers | |
178 if value in headers: | |
179 duplicate_indices.append(i) | |
180 continue | |
181 #else adds the header to headers | |
182 else: | |
183 headers.append(value) | |
184 #Adds appropriate headers when proteomic input is supplied | |
185 if (probed_filename != 'None'): | |
186 headers.insert(n, 'Variant peptide') | |
187 headers.insert(n, 'Reference peptide') | |
188 tsvout.writerow(headers) | |
189 else: | |
190 cells = [] | |
191 #Goes through each value in the next list | |
192 for value in row: | |
193 #adds it to cells | |
194 cells.append(value) | |
195 #Goes through each value from the VR file after position 11 (After it is done repeating from VAD file) | |
196 for i,value in enumerate(row_2): | |
197 #adds in the rest of the values to cells | |
198 if i not in duplicate_indices: | |
199 # Skips the initial 11 columns and the VEST p-value (already in VR file) | |
200 cells.append(value) | |
201 | |
202 # Verifies the peptides intersected previously through sequences obtained from Ensembl's API | |
203 if (probed_filename != 'None'): | |
204 cells.insert(n,'') | |
205 cells.insert(n,'') | |
206 input_line = cells[headers.index('Input line')] | |
207 if input_line in pep_map: | |
208 pepseq = pep_map[input_line][0] | |
209 aa_changes = pep_muts[pepseq] | |
210 transcript_id = cells[headers.index('S.O. transcript')] | |
211 ref_fullseq = getSequence(transcript_id) | |
212 # Checks the other S.O. transcripts if the primary S.O. transcript has no sequence available | |
213 if not ref_fullseq: | |
214 transcripts = cells[headers.index('S.O. all transcripts')] | |
215 for transcript in transcripts.split(','): | |
216 if transcript: | |
217 mat = SOtranscripts.search(transcript) | |
218 ref_fullseq = getSequence(mat.group(1)) | |
219 if ref_fullseq: | |
220 aa_changes = {mat.group(2): [aa_changes.values()[0][0]]} | |
221 break | |
222 # Resubmits the previous transcripts without extensions if all S.O. transcripts fail to provide a sequence | |
223 if not ref_fullseq: | |
224 transcripts = cells[headers.index('S.O. all transcripts')] | |
225 for transcript in transcripts.split(','): | |
226 if transcript: | |
227 mat = SOtranscripts.search(transcript) | |
228 ref_fullseq = getSequence(mat.group(1).split('.')[0]) | |
229 if ref_fullseq: | |
230 aa_changes = {mat.group(2): [aa_changes.values()[0][0]]} | |
231 break | |
232 if ref_fullseq: | |
233 # Sorts the amino acid changes | |
234 positions = {} | |
235 for aa_change in aa_changes: | |
236 m = reg_seq_change.search(aa_change) | |
237 aa_protpos = int(m.group(2)) | |
238 aa_peppos = aa_changes[aa_change][0] | |
239 aa_startpos = aa_protpos - aa_peppos - 1 | |
240 if aa_startpos in positions: | |
241 positions[aa_startpos].append(aa_change) | |
242 else: | |
243 positions[aa_startpos] = [aa_change] | |
244 # Goes through the sorted categories to mutate the Ensembl peptide (uses proBED peptide as a reference) | |
245 for pep_protpos in positions: | |
246 ref_seq = ref_fullseq[pep_protpos:pep_protpos+len(pepseq)] | |
247 muts = positions[pep_protpos] | |
248 options = [] | |
249 mut_seq = ref_seq | |
250 for mut in muts: | |
251 m = reg_seq_change.search(mut) | |
252 ref_aa = m.group(1) | |
253 mut_pos = int(m.group(2)) | |
254 alt_aa = m.group(3) | |
255 pep_mutpos = mut_pos - pep_protpos - 1 | |
256 if ref_seq[pep_mutpos] == ref_aa and (pepseq[pep_mutpos] == alt_aa or pepseq[pep_mutpos] == ref_aa): | |
257 if pepseq[pep_mutpos] == ref_aa: | |
258 mut_seq = mut_seq[:pep_mutpos] + ref_aa + mut_seq[pep_mutpos+1:] | |
259 else: | |
260 mut_seq = mut_seq[:pep_mutpos] + alt_aa + mut_seq[pep_mutpos+1:] | |
261 else: | |
262 break | |
263 # Adds the mutated peptide and reference peptide if mutated correctly | |
264 if pepseq == mut_seq: | |
265 cells[n+1] = pepseq | |
266 cells[n] = ref_seq | |
267 #print cells | |
268 tsvout.writerow(cells) | |
269 | |
270 | |
271 | |
272 | |
273 | |
274 | |
275 #a = 'col1\tcol2\tcol3' | |
276 #header_list = a.split('\t') | |
277 | |
278 #loop through the two results, when you first hit header you print out the headers in tabular form | |
279 #Print out each header only once | |
280 #Combine both headers into one output file | |
281 #loop through the rest of the data and assign each value to its assigned header | |
282 #combine this all into one output file | |
283 | |
284 | |
285 | |
286 | |
287 |