Mercurial > repos > rsajulga > proteomic_cravat_score_and_annotate

comparison: cravat_submit.py @ 2:676c8be98be4 (draft)
changeset description: Uploaded

author | rsajulga |
---|---|
date | Thu, 17 May 2018 22:32:25 -0400 |
parents | 7ebdd4ac13a2 |
children | (none) |
1:6e90e7ad5e09 (older revision) | 2:676c8be98be4 (this draft) |
---|---|
3 import time | 3 import time |
4 import urllib | 4 import urllib |
5 import sys | 5 import sys |
6 import csv | 6 import csv |
7 import re | 7 import re |
8 import math | |
9 from difflib import SequenceMatcher | |
10 from xml.etree import ElementTree as ET | |
11 import sqlite3 | |
8 | 12 |
9 try: | 13 try: |
10 input_filename = sys.argv[1] | 14 input_filename = sys.argv[1] |
11 input_select_bar = sys.argv[2] | 15 input_select_bar = sys.argv[2] |
12 GRCh_build = sys.argv[3] | 16 GRCh_build = sys.argv[3] |
13 psm_filename = sys.argv[4] | 17 probed_filename = sys.argv[4] |
14 output_filename = sys.argv[5] | 18 output_filename = sys.argv[5] |
15 file_3 = sys.argv[6] | 19 file_3 = sys.argv[6] |
16 file_4 = sys.argv[7] | 20 file_4 = sys.argv[7] |
17 file_5 = sys.argv[8] | 21 file_5 = sys.argv[8] |
18 except: | 22 except: |
19 input_filename = '1.) Galaxy2-[Human_Vcf_MCF7]-minimum.vcf' | 23 # Filenames for testing. |
20 input_filename = 'input/[tgriffin_cguerrer_20160726_MCF7_RNAseq_01_S13_R1_001.vcf].vcf' | 24 input_filename = 'input/Galaxy68-[VCF-BEDintersect__on_data_65_and_data_6].vcf' |
21 input_select_bar = 'VEST' | 25 input_select_bar = 'VEST' |
22 GRCh_build = 'GRCh38' | 26 GRCh_build = 'GRCh38' |
23 output_filename = 'combined_variants.tsv' | 27 output_filename = 'combined_variants.tsv' |
24 psm_filename = 'input/[ERLIC_MCF7_110kb_R123-CustomProDB_RNA-Seq_cRAP_DB.psm-report].tabular' | 28 probed_filename = 'input/Galaxy66-[PepPointer].bed' |
25 file_3 = 'output/' + time.strftime("%H:%M") + '_Z_Gene_Level_Analysis.tsv' | 29 file_3 = 'output/Gene_Level_Analysis.tsv' |
26 file_4 = 'output/' + time.strftime("%H:%M") + '_Z_Variant_Non-coding.Result.tsv' | 30 file_4 = 'output/Variant_Non-coding.Result.tsv' |
27 file_5 = 'output/' + time.strftime("%H:%M") + '_Z_Input_Errors.Result.tsv' | 31 file_5 = 'output/Input_Errors.Result.tsv' |
28 | 32 matches_filename = 'matches.tsv' |
29 | 33 |
30 #in_file = open('input_call.txt', "r") | 34 def getSequence(transcript_id): |
31 #out_file = open('output_call.txt', "w") | 35 server = 'http://rest.ensembl.org' |
36 ext = '/sequence/id/' + transcript_id + '?content-type=text/x-seqxml%2Bxml;multiple_sequences=1;type=protein' | |
37 req = requests.get(server+ext, headers={ "Content-Type" : "text/plain"}) | |
38 | |
39 if not req.ok: | |
40 return None | |
41 | |
42 root = ET.fromstring(req.content) | |
43 for child in root.iter('AAseq'): | |
44 return child.text | |
45 | |
32 | 46 |
33 write_header = True | 47 write_header = True |
34 | 48 |
35 GRCh37hg19 = 'off' | 49 GRCh37hg19 = 'off' |
36 if GRCh_build == 'GRCh37': | 50 if GRCh_build == 'GRCh37': |
37 GRCh37hg19 = 'on' | 51 GRCh37hg19 = 'on' |
38 | 52 |
39 # http://staging.cravat.us/CRAVAT/rest/service/submit: | |
40 | |
41 #plugs in params to given URL | 53 #plugs in params to given URL |
42 submit = requests.post('http://staging.cravat.us/CRAVAT/rest/service/submit', files={'inputfile':open(input_filename)}, data={'email':'znylund@insilico.us.com', 'analyses': input_select_bar, 'hg19': GRCh37hg19}) | 54 submit = requests.post('http://staging.cravat.us/CRAVAT/rest/service/submit', files={'inputfile':open(input_filename)}, data={'email':'znylund@insilico.us.com', 'analyses': input_select_bar, 'hg19': GRCh37hg19}) |
43 | 55 |
44 #Makes the data a json dictionary, takes out only the job ID | 56 #Makes the data a json dictionary, takes out only the job ID |
45 jobid = json.loads(submit.text)['jobid'] | 57 jobid = json.loads(submit.text)['jobid'] |
58 | |
46 #out_file.write(jobid) | 59 #out_file.write(jobid) |
47 submitted = json.loads(submit.text)['status'] | 60 submitted = json.loads(submit.text)['status'] |
48 #out_file.write('\t' + submitted) | 61 #out_file.write('\t' + submitted) |
49 | 62 |
50 input_file = open(input_filename) | 63 input_file = open(input_filename) |
51 is_comment_line = re.compile(".*#+.*") | 64 |
52 | 65 # Loads the proBED file as a list. |
53 | 66 if (probed_filename != 'None'): |
67 proBED = [] | |
68 with open(probed_filename) as tsvin: | |
69 tsvreader = csv.reader(tsvin, delimiter='\t') | |
70 for i, row in enumerate(tsvreader): | |
71 proBED.append(row) | |
72 | |
54 #loops until we find a status equal to Success, then breaks | 73 #loops until we find a status equal to Success, then breaks |
55 while True: | 74 while True: |
56 check = requests.get('http://staging.cravat.us/CRAVAT/rest/service/status', params={'jobid': jobid}) | 75 check = requests.get('http://staging.cravat.us/CRAVAT/rest/service/status', params={'jobid': jobid}) |
57 status = json.loads(check.text)['status'] | 76 status = json.loads(check.text)['status'] |
58 resultfileurl = json.loads(check.text)['resultfileurl'] | 77 resultfileurl = json.loads(check.text)['resultfileurl'] |
60 if status == 'Success': | 79 if status == 'Success': |
61 #out_file.write('\t' + resultfileurl) | 80 #out_file.write('\t' + resultfileurl) |
62 break | 81 break |
63 else: | 82 else: |
64 time.sleep(2) | 83 time.sleep(2) |
65 | 84 |
66 #out_file.write('\n') | 85 #out_file.write('\n') |
67 | 86 |
68 #creates three files | 87 #creates three files |
69 file_1 = time.strftime("%H:%M") + '_Z_Variant_Result.tsv' | 88 file_1 = 'Variant_Result.tsv' |
70 file_2 = time.strftime("%H:%M") + '_Z_Additional_Details.tsv' | 89 file_2 = 'Additional_Details.tsv' |
71 #file_3 = time.strftime("%H:%M") + 'Combined_Variant_Results.tsv' | 90 #file_3 = time.strftime("%H:%M") + 'Combined_Variant_Results.tsv' |
72 | 91 |
73 #Download the two results | 92 #Downloads the tabular results |
74 urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Variant.Result.tsv", file_1) | 93 urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Variant.Result.tsv", file_1) |
75 urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Variant_Additional_Details.Result.tsv", file_2) | 94 urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Variant_Additional_Details.Result.tsv", file_2) |
76 urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Gene_Level_Analysis.Result.tsv", file_3) | 95 urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Gene_Level_Analysis.Result.tsv", file_3) |
77 urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Variant_Non-coding.Result.tsv", file_4) | 96 urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Variant_Non-coding.Result.tsv", file_4) |
78 urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Input_Errors.Result.tsv", file_5) | 97 urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Input_Errors.Result.tsv", file_5) |
79 | 98 |
80 headers = [] | |
81 duplicates = [] | |
82 | |
83 #opens the Variant Result file and the Variant Additional Details file as csv readers, then opens the output file (galaxy) as a writer | 99 #opens the Variant Result file and the Variant Additional Details file as csv readers, then opens the output file (galaxy) as a writer |
100 with open(file_1) as tsvin_1, open(file_2) as tsvin_2, open(output_filename, 'wb') as tsvout: | |
101 tsvreader_2 = csv.reader(tsvin_2, delimiter='\t') | |
102 tsvout = csv.writer(tsvout, delimiter='\t') | |
103 | |
104 headers = [] | |
105 duplicate_indices = [] | |
106 n = 12 #Index for proteogenomic column start | |
107 reg_seq_change = re.compile('([A-Z]+)(\d+)([A-Z]+)') | |
108 SOtranscripts = re.compile('([A-Z]+[\d\.]+):([A-Z]+\d+[A-Z]+)') | |
109 pep_muts = {} | |
110 pep_map = {} | |
111 rows = [] | |
112 | |
113 for row in tsvreader_2: | |
114 if row != [] and row[0][0] != '#': | |
115 #checks if the row begins with input line | |
116 if row[0] == 'Input line': | |
117 vad_headers = row | |
118 else: | |
119 # Initially screens through the output Variant Additional Details to catch mutations on same peptide region | |
120 genchrom = row[vad_headers.index('Chromosome')] | |
121 genpos = int(row[vad_headers.index('Position')]) | |
122 aa_change = row[vad_headers.index('Protein sequence change')] | |
123 input_line = row[vad_headers.index('Input line')] | |
124 | |
125 for peptide in proBED: | |
126 pepseq = peptide[3] | |
127 pepchrom = peptide[0] | |
128 pepposA = int(peptide[1]) | |
129 pepposB = int(peptide[2]) | |
130 if genchrom == pepchrom and pepposA <= genpos and genpos <= pepposB: | |
131 strand = row[vad_headers.index('Strand')] | |
132 transcript_strand = row[vad_headers.index('S.O. transcript strand')] | |
133 | |
134 # Calculates the position of the variant amino acid(s) on peptide | |
135 if transcript_strand == strand: | |
136 aa_peppos = int(math.ceil((genpos - pepposA)/3.0) - 1) | |
137 if strand == '-' or transcript_strand == '-' or aa_peppos >= len(pepseq): | |
138 aa_peppos = int(math.floor((pepposB - genpos)/3.0)) | |
139 if pepseq in pep_muts: | |
140 if aa_change not in pep_muts[pepseq]: | |
141 pep_muts[pepseq][aa_change] = [aa_peppos] | |
142 else: | |
143 if aa_peppos not in pep_muts[pepseq][aa_change]: | |
144 pep_muts[pepseq][aa_change].append(aa_peppos) | |
145 else: | |
146 pep_muts[pepseq] = {aa_change : [aa_peppos]} | |
147 # Stores the intersect information by mapping Input Line (CRAVAT output) to peptide sequence. | |
148 if input_line in pep_map: | |
149 if pepseq not in pep_map[input_line]: | |
150 pep_map[input_line].append(pepseq) | |
151 else: | |
152 pep_map[input_line] = [pepseq] | |
153 | |
84 with open(file_1) as tsvin_1, open(file_2) as tsvin_2, open(output_filename, 'wb') as tsvout: | 154 with open(file_1) as tsvin_1, open(file_2) as tsvin_2, open(output_filename, 'wb') as tsvout: |
85 tsvreader_1 = csv.reader(tsvin_1, delimiter='\t') | 155 tsvreader_1 = csv.reader(tsvin_1, delimiter='\t') |
86 tsvreader_2 = csv.reader(tsvin_2, delimiter='\t') | 156 tsvreader_2 = csv.reader(tsvin_2, delimiter='\t') |
87 | |
88 tsvout = csv.writer(tsvout, delimiter='\t') | 157 tsvout = csv.writer(tsvout, delimiter='\t') |
89 | 158 |
90 # Processes the PSM report | 159 headers = [] |
91 if (psm_filename != 'None'): | 160 |
92 tsvin_3 = open(psm_filename) | 161 #loops through each row in the Variant Additional Details (VAD) file |
93 psmreader = csv.reader(tsvin_3, delimiter='\t') | 162 for row in tsvreader_2: |
94 | 163 |
95 psmreader.next() | 164 #sets row_2 equal to the same row in Variant Result (VR) file |
96 peptide_map = {} | |
97 s = re.compile('[A-Z][0-9]+[A-Z]') | |
98 for row in psmreader: | |
99 pro_name = row[1] | |
100 pep_seq = row[2] | |
101 | |
102 prot_seq_changes = s.findall(pro_name) | |
103 | |
104 for change in prot_seq_changes: | |
105 if change in peptide_map: | |
106 if pep_seq not in peptide_map[change].split(';'): | |
107 peptide_map[change] = peptide_map[change] + ';' + pep_seq | |
108 else: | |
109 peptide_map[change] = pep_seq | |
110 | |
111 #loops through each row in the Variant Additional Details file | |
112 | |
113 print 'Checkpoint 3' | |
114 for row in tsvreader_2: | |
115 #sets row_2 equal to the same row in Variant Result file | |
116 row_2 = tsvreader_1.next() | 165 row_2 = tsvreader_1.next() |
117 #checks if row is empty or if the first term contains '#' | 166 #checks if row is empty or if the first term contains '#' |
118 if row == [] or row[0][0] == '#': | 167 if row == [] or row[0][0] == '#': |
119 tsvout.writerow(row) | 168 tsvout.writerow(row) |
120 else: | 169 else: |
121 #checks if the row begins with input line | 170 if row[0] == 'Input line': |
122 if row[0] == 'Input line': | |
123 #Goes through each value in the headers list in VAD | 171 #Goes through each value in the headers list in VAD |
124 #print 'Original row' | |
125 #print row | |
126 #print row_2 | |
127 for value in row: | 172 for value in row: |
128 #Adds each value into headers | 173 #Adds each value into headers |
129 headers.append(value) | 174 headers.append(value) |
130 #Loops through the Keys in VR | 175 #Loops through the Keys in VR |
131 for value in row_2: | 176 for i,value in enumerate(row_2): |
132 #Checks if the value is already in headers | 177 #Checks if the value is already in headers |
133 if value in headers: | 178 if value in headers: |
179 duplicate_indices.append(i) | |
134 continue | 180 continue |
135 #else adds the header to headers | 181 #else adds the header to headers |
136 else: | 182 else: |
137 headers.append(value) | 183 headers.append(value) |
138 if (psm_filename != 'None'): | 184 #Adds appropriate headers when proteomic input is supplied |
139 headers.insert(1, 'Peptide') | 185 if (probed_filename != 'None'): |
140 #print headers | 186 headers.insert(n, 'Variant peptide') |
187 headers.insert(n, 'Reference peptide') | |
141 tsvout.writerow(headers) | 188 tsvout.writerow(headers) |
142 else: | 189 else: |
143 | |
144 cells = [] | 190 cells = [] |
145 #Inserts a peptide column into the row | |
146 if (psm_filename != 'None'): | |
147 if row[12] in peptide_map: | |
148 row.insert(1, peptide_map[row[12]]) | |
149 else: | |
150 row.insert(1, '') | |
151 | |
152 #Goes through each value in the next list | 191 #Goes through each value in the next list |
153 for i,value in enumerate(row): | 192 for value in row: |
154 #adds it to cells | 193 #adds it to cells |
155 cells.append(value) | 194 cells.append(value) |
156 #Goes through each value from the VR file after position 11 (After it is done repeating from VAD file) | 195 #Goes through each value from the VR file after position 11 (After it is done repeating from VAD file) |
157 for i,value in enumerate(row_2[11:]): | 196 for i,value in enumerate(row_2): |
158 #adds in the rest of the values to cells | 197 #adds in the rest of the values to cells |
159 | 198 if i not in duplicate_indices: |
160 # Skips the 2nd VEST p-value | 199 # Skips the initial 11 columns and the VEST p-value (already in VR file) |
161 if (i != 49 - 11): | |
162 cells.append(value) | 200 cells.append(value) |
163 | 201 |
164 print cells | 202 # Verifies the peptides intersected previously through sequences obtained from Ensembl's API |
203 if (probed_filename != 'None'): | |
204 cells.insert(n,'') | |
205 cells.insert(n,'') | |
206 input_line = cells[headers.index('Input line')] | |
207 if input_line in pep_map: | |
208 pepseq = pep_map[input_line][0] | |
209 aa_changes = pep_muts[pepseq] | |
210 transcript_id = cells[headers.index('S.O. transcript')] | |
211 ref_fullseq = getSequence(transcript_id) | |
212 # Checks the other S.O. transcripts if the primary S.O. transcript has no sequence available | |
213 if not ref_fullseq: | |
214 transcripts = cells[headers.index('S.O. all transcripts')] | |
215 for transcript in transcripts.split(','): | |
216 if transcript: | |
217 mat = SOtranscripts.search(transcript) | |
218 ref_fullseq = getSequence(mat.group(1)) | |
219 if ref_fullseq: | |
220 aa_changes = {mat.group(2): [aa_changes.values()[0][0]]} | |
221 break | |
222 # Resubmits the previous transcripts without extensions if all S.O. transcripts fail to provide a sequence | |
223 if not ref_fullseq: | |
224 transcripts = cells[headers.index('S.O. all transcripts')] | |
225 for transcript in transcripts.split(','): | |
226 if transcript: | |
227 mat = SOtranscripts.search(transcript) | |
228 ref_fullseq = getSequence(mat.group(1).split('.')[0]) | |
229 if ref_fullseq: | |
230 aa_changes = {mat.group(2): [aa_changes.values()[0][0]]} | |
231 break | |
232 if ref_fullseq: | |
233 # Sorts the amino acid changes | |
234 positions = {} | |
235 for aa_change in aa_changes: | |
236 m = reg_seq_change.search(aa_change) | |
237 aa_protpos = int(m.group(2)) | |
238 aa_peppos = aa_changes[aa_change][0] | |
239 aa_startpos = aa_protpos - aa_peppos - 1 | |
240 if aa_startpos in positions: | |
241 positions[aa_startpos].append(aa_change) | |
242 else: | |
243 positions[aa_startpos] = [aa_change] | |
244 # Goes through the sorted categories to mutate the Ensembl peptide (uses proBED peptide as a reference) | |
245 for pep_protpos in positions: | |
246 ref_seq = ref_fullseq[pep_protpos:pep_protpos+len(pepseq)] | |
247 muts = positions[pep_protpos] | |
248 options = [] | |
249 mut_seq = ref_seq | |
250 for mut in muts: | |
251 m = reg_seq_change.search(mut) | |
252 ref_aa = m.group(1) | |
253 mut_pos = int(m.group(2)) | |
254 alt_aa = m.group(3) | |
255 pep_mutpos = mut_pos - pep_protpos - 1 | |
256 if ref_seq[pep_mutpos] == ref_aa and (pepseq[pep_mutpos] == alt_aa or pepseq[pep_mutpos] == ref_aa): | |
257 if pepseq[pep_mutpos] == ref_aa: | |
258 mut_seq = mut_seq[:pep_mutpos] + ref_aa + mut_seq[pep_mutpos+1:] | |
259 else: | |
260 mut_seq = mut_seq[:pep_mutpos] + alt_aa + mut_seq[pep_mutpos+1:] | |
261 else: | |
262 break | |
263 # Adds the mutated peptide and reference peptide if mutated correctly | |
264 if pepseq == mut_seq: | |
265 cells[n+1] = pepseq | |
266 cells[n] = ref_seq | |
267 #print cells | |
165 tsvout.writerow(cells) | 268 tsvout.writerow(cells) |
166 | 269 |
167 | 270 |
168 | 271 |
169 | |
170 | |
171 | |
172 | |
173 | |
174 | |
175 | |
176 | |
177 | |
178 | |
179 | |
180 | |
181 | |
182 | |
183 | |
184 | |
185 | |
186 | |
187 | |
188 | |
189 | |
190 | |
191 | |
192 | |
193 | 272 |
194 | 273 |
195 | 274 |
196 #a = 'col1\tcol2\tcol3' | 275 #a = 'col1\tcol2\tcol3' |
197 #header_list = a.split('\t') | 276 #header_list = a.split('\t') |
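
The new revision keeps the old revision's CRAVAT REST interaction: POST the VCF to the submit service, poll the status service until the job reports Success, then download the tabular result files. Below is a minimal, self-contained sketch of that cycle, reusing the staging endpoints and form field names from the script; the e-mail address is a placeholder and error handling is omitted.

# Sketch only: same endpoints and field names as cravat_submit.py above;
# the e-mail address is a placeholder, not a value required by the service.
import json
import time

import requests

BASE = 'http://staging.cravat.us/CRAVAT'

def run_cravat(vcf_path, analyses='VEST', hg19='off'):
    # Submit the VCF; the reply is JSON containing the job id.
    with open(vcf_path) as handle:
        submit = requests.post(BASE + '/rest/service/submit',
                               files={'inputfile': handle},
                               data={'email': 'user@example.org',
                                     'analyses': analyses,
                                     'hg19': hg19})
    jobid = json.loads(submit.text)['jobid']

    # Poll until the job reports Success, as the while-loop in the script does.
    while True:
        check = requests.get(BASE + '/rest/service/status',
                             params={'jobid': jobid})
        if json.loads(check.text)['status'] == 'Success':
            break
        time.sleep(2)

    # The result files are then plain-HTTP downloads under /results/<jobid>/.
    variant_tsv = requests.get(BASE + '/results/' + jobid + '/Variant.Result.tsv')
    return variant_tsv.text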
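
The proBED intersection added in this revision turns a genomic variant position that falls inside a peptide's BED interval into a 0-based residue index within that peptide. The following standalone restatement of that arithmetic includes a worked example; the function wrapper, the None guard, and the coordinates are illustrative, not part of the script.

# Restatement of the residue-index arithmetic used when a variant position
# falls inside a proBED peptide interval.  The formulas match the script;
# the wrapper, the None guard, and the example coordinates are illustrative.
import math

def residue_index(genpos, pepposA, pepposB, pepseq, strand, transcript_strand):
    """0-based index within pepseq of the residue covered by genpos."""
    aa_peppos = None
    if transcript_strand == strand:
        # Forward case: count whole codons from the peptide's genomic start
        # (the formula assumes a 0-based BED start and a 1-based variant position).
        aa_peppos = int(math.ceil((genpos - pepposA) / 3.0) - 1)
    if (strand == '-' or transcript_strand == '-'
            or aa_peppos is None or aa_peppos >= len(pepseq)):
        # Reverse strand (or overflow): count codons back from the genomic end.
        aa_peppos = int(math.floor((pepposB - genpos) / 3.0))
    return aa_peppos

# A 5-residue peptide occupying BED interval (1000, 1015), i.e. bases
# 1001-1015: a variant at base 1007 sits in the third codon, residue index 2.
print(residue_index(1007, 1000, 1015, 'PEPTI', '+', '+'))   # -> 2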
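
getSequence() fetches the reference protein sequence for an S.O. transcript from the Ensembl REST service, and the verification block later retries with the transcript version suffix stripped when the versioned id yields nothing. A compact sketch of that lookup plus fallback, using the same endpoint and SeqXML parsing as the script; the wrapper names and the example transcript id are illustrative.

# Sketch of the Ensembl REST protein-sequence lookup used by getSequence(),
# plus the version-stripping fallback applied later in the script.
# Wrapper names and the example transcript id are illustrative.
import requests
from xml.etree import ElementTree as ET

SERVER = 'http://rest.ensembl.org'

def fetch_protein_sequence(transcript_id):
    ext = ('/sequence/id/' + transcript_id +
           '?content-type=text/x-seqxml%2Bxml;multiple_sequences=1;type=protein')
    req = requests.get(SERVER + ext, headers={'Content-Type': 'text/plain'})
    if not req.ok:
        return None
    # SeqXML wraps each translation in an <AAseq> element; take the first one.
    for child in ET.fromstring(req.content).iter('AAseq'):
        return child.text
    return None

def fetch_with_fallback(transcript_id):
    # Try the id as given (possibly versioned, e.g. ENST00000288602.6),
    # then retry with the version suffix stripped.
    return (fetch_protein_sequence(transcript_id) or
            fetch_protein_sequence(transcript_id.split('.')[0]))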