comparison cravat_submit.py @ 2:676c8be98be4 draft

Uploaded
author rsajulga
date Thu, 17 May 2018 22:32:25 -0400
parents 7ebdd4ac13a2
children
comparison
equal deleted inserted replaced
1:6e90e7ad5e09 2:676c8be98be4
3 import time 3 import time
4 import urllib 4 import urllib
5 import sys 5 import sys
6 import csv 6 import csv
7 import re 7 import re
8 import math
9 from difflib import SequenceMatcher
10 from xml.etree import ElementTree as ET
11 import sqlite3
8 12
9 try: 13 try:
10 input_filename = sys.argv[1] 14 input_filename = sys.argv[1]
11 input_select_bar = sys.argv[2] 15 input_select_bar = sys.argv[2]
12 GRCh_build = sys.argv[3] 16 GRCh_build = sys.argv[3]
13 psm_filename = sys.argv[4] 17 probed_filename = sys.argv[4]
14 output_filename = sys.argv[5] 18 output_filename = sys.argv[5]
15 file_3 = sys.argv[6] 19 file_3 = sys.argv[6]
16 file_4 = sys.argv[7] 20 file_4 = sys.argv[7]
17 file_5 = sys.argv[8] 21 file_5 = sys.argv[8]
18 except: 22 except:
19 input_filename = '1.) Galaxy2-[Human_Vcf_MCF7]-minimum.vcf' 23 # Filenames for testing.
20 input_filename = 'input/[tgriffin_cguerrer_20160726_MCF7_RNAseq_01_S13_R1_001.vcf].vcf' 24 input_filename = 'input/Galaxy68-[VCF-BEDintersect__on_data_65_and_data_6].vcf'
21 input_select_bar = 'VEST' 25 input_select_bar = 'VEST'
22 GRCh_build = 'GRCh38' 26 GRCh_build = 'GRCh38'
23 output_filename = 'combined_variants.tsv' 27 output_filename = 'combined_variants.tsv'
24 psm_filename = 'input/[ERLIC_MCF7_110kb_R123-CustomProDB_RNA-Seq_cRAP_DB.psm-report].tabular' 28 probed_filename = 'input/Galaxy66-[PepPointer].bed'
25 file_3 = 'output/' + time.strftime("%H:%M") + '_Z_Gene_Level_Analysis.tsv' 29 file_3 = 'output/Gene_Level_Analysis.tsv'
26 file_4 = 'output/' + time.strftime("%H:%M") + '_Z_Variant_Non-coding.Result.tsv' 30 file_4 = 'output/Variant_Non-coding.Result.tsv'
27 file_5 = 'output/' + time.strftime("%H:%M") + '_Z_Input_Errors.Result.tsv' 31 file_5 = 'output/Input_Errors.Result.tsv'
28 32 matches_filename = 'matches.tsv'
29 33
30 #in_file = open('input_call.txt', "r") 34 def getSequence(transcript_id):
31 #out_file = open('output_call.txt', "w") 35 server = 'http://rest.ensembl.org'
36 ext = '/sequence/id/' + transcript_id + '?content-type=text/x-seqxml%2Bxml;multiple_sequences=1;type=protein'
37 req = requests.get(server+ext, headers={ "Content-Type" : "text/plain"})
38
39 if not req.ok:
40 return None
41
42 root = ET.fromstring(req.content)
43 for child in root.iter('AAseq'):
44 return child.text
45
32 46
33 write_header = True 47 write_header = True
34 48
35 GRCh37hg19 = 'off' 49 GRCh37hg19 = 'off'
36 if GRCh_build == 'GRCh37': 50 if GRCh_build == 'GRCh37':
37 GRCh37hg19 = 'on' 51 GRCh37hg19 = 'on'
38 52
39 # http://staging.cravat.us/CRAVAT/rest/service/submit:
40
41 #plugs in params to given URL 53 #plugs in params to given URL
42 submit = requests.post('http://staging.cravat.us/CRAVAT/rest/service/submit', files={'inputfile':open(input_filename)}, data={'email':'znylund@insilico.us.com', 'analyses': input_select_bar, 'hg19': GRCh37hg19}) 54 submit = requests.post('http://staging.cravat.us/CRAVAT/rest/service/submit', files={'inputfile':open(input_filename)}, data={'email':'znylund@insilico.us.com', 'analyses': input_select_bar, 'hg19': GRCh37hg19})
43 55
44 #Makes the data a json dictionary, takes out only the job ID 56 #Makes the data a json dictionary, takes out only the job ID
45 jobid = json.loads(submit.text)['jobid'] 57 jobid = json.loads(submit.text)['jobid']
58
46 #out_file.write(jobid) 59 #out_file.write(jobid)
47 submitted = json.loads(submit.text)['status'] 60 submitted = json.loads(submit.text)['status']
48 #out_file.write('\t' + submitted) 61 #out_file.write('\t' + submitted)
49 62
50 input_file = open(input_filename) 63 input_file = open(input_filename)
51 is_comment_line = re.compile(".*#+.*") 64
52 65 # Loads the proBED file as a list.
53 66 if (probed_filename != 'None'):
67 proBED = []
68 with open(probed_filename) as tsvin:
69 tsvreader = csv.reader(tsvin, delimiter='\t')
70 for i, row in enumerate(tsvreader):
71 proBED.append(row)
72
54 #loops until we find a status equal to Success, then breaks 73 #loops until we find a status equal to Success, then breaks
55 while True: 74 while True:
56 check = requests.get('http://staging.cravat.us/CRAVAT/rest/service/status', params={'jobid': jobid}) 75 check = requests.get('http://staging.cravat.us/CRAVAT/rest/service/status', params={'jobid': jobid})
57 status = json.loads(check.text)['status'] 76 status = json.loads(check.text)['status']
58 resultfileurl = json.loads(check.text)['resultfileurl'] 77 resultfileurl = json.loads(check.text)['resultfileurl']
60 if status == 'Success': 79 if status == 'Success':
61 #out_file.write('\t' + resultfileurl) 80 #out_file.write('\t' + resultfileurl)
62 break 81 break
63 else: 82 else:
64 time.sleep(2) 83 time.sleep(2)
65 84
66 #out_file.write('\n') 85 #out_file.write('\n')
67 86
68 #creates three files 87 #creates three files
69 file_1 = time.strftime("%H:%M") + '_Z_Variant_Result.tsv' 88 file_1 = 'Variant_Result.tsv'
70 file_2 = time.strftime("%H:%M") + '_Z_Additional_Details.tsv' 89 file_2 = 'Additional_Details.tsv'
71 #file_3 = time.strftime("%H:%M") + 'Combined_Variant_Results.tsv' 90 #file_3 = time.strftime("%H:%M") + 'Combined_Variant_Results.tsv'
72 91
73 #Download the two results 92 #Downloads the tabular results
74 urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Variant.Result.tsv", file_1) 93 urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Variant.Result.tsv", file_1)
75 urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Variant_Additional_Details.Result.tsv", file_2) 94 urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Variant_Additional_Details.Result.tsv", file_2)
76 urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Gene_Level_Analysis.Result.tsv", file_3) 95 urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Gene_Level_Analysis.Result.tsv", file_3)
77 urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Variant_Non-coding.Result.tsv", file_4) 96 urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Variant_Non-coding.Result.tsv", file_4)
78 urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Input_Errors.Result.tsv", file_5) 97 urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Input_Errors.Result.tsv", file_5)
79 98
80 headers = []
81 duplicates = []
82
83 #opens the Variant Result file and the Variant Additional Details file as csv readers, then opens the output file (galaxy) as a writer 99 #opens the Variant Result file and the Variant Additional Details file as csv readers, then opens the output file (galaxy) as a writer
100 with open(file_1) as tsvin_1, open(file_2) as tsvin_2, open(output_filename, 'wb') as tsvout:
101 tsvreader_2 = csv.reader(tsvin_2, delimiter='\t')
102 tsvout = csv.writer(tsvout, delimiter='\t')
103
104 headers = []
105 duplicate_indices = []
106 n = 12 #Index for proteogenomic column start
107 reg_seq_change = re.compile('([A-Z]+)(\d+)([A-Z]+)')
108 SOtranscripts = re.compile('([A-Z]+[\d\.]+):([A-Z]+\d+[A-Z]+)')
109 pep_muts = {}
110 pep_map = {}
111 rows = []
112
113 for row in tsvreader_2:
114 if row != [] and row[0][0] != '#':
115 #checks if the row begins with input line
116 if row[0] == 'Input line':
117 vad_headers = row
118 else:
119 # Initially screens through the output Variant Additional Details to catch mutations on same peptide region
120 genchrom = row[vad_headers.index('Chromosome')]
121 genpos = int(row[vad_headers.index('Position')])
122 aa_change = row[vad_headers.index('Protein sequence change')]
123 input_line = row[vad_headers.index('Input line')]
124
125 for peptide in proBED:
126 pepseq = peptide[3]
127 pepchrom = peptide[0]
128 pepposA = int(peptide[1])
129 pepposB = int(peptide[2])
130 if genchrom == pepchrom and pepposA <= genpos and genpos <= pepposB:
131 strand = row[vad_headers.index('Strand')]
132 transcript_strand = row[vad_headers.index('S.O. transcript strand')]
133
134 # Calculates the position of the variant amino acid(s) on peptide
135 if transcript_strand == strand:
136 aa_peppos = int(math.ceil((genpos - pepposA)/3.0) - 1)
137 if strand == '-' or transcript_strand == '-' or aa_peppos >= len(pepseq):
138 aa_peppos = int(math.floor((pepposB - genpos)/3.0))
139 if pepseq in pep_muts:
140 if aa_change not in pep_muts[pepseq]:
141 pep_muts[pepseq][aa_change] = [aa_peppos]
142 else:
143 if aa_peppos not in pep_muts[pepseq][aa_change]:
144 pep_muts[pepseq][aa_change].append(aa_peppos)
145 else:
146 pep_muts[pepseq] = {aa_change : [aa_peppos]}
147 # Stores the intersect information by mapping Input Line (CRAVAT output) to peptide sequence.
148 if input_line in pep_map:
149 if pepseq not in pep_map[input_line]:
150 pep_map[input_line].append(pepseq)
151 else:
152 pep_map[input_line] = [pepseq]
153
84 with open(file_1) as tsvin_1, open(file_2) as tsvin_2, open(output_filename, 'wb') as tsvout: 154 with open(file_1) as tsvin_1, open(file_2) as tsvin_2, open(output_filename, 'wb') as tsvout:
85 tsvreader_1 = csv.reader(tsvin_1, delimiter='\t') 155 tsvreader_1 = csv.reader(tsvin_1, delimiter='\t')
86 tsvreader_2 = csv.reader(tsvin_2, delimiter='\t') 156 tsvreader_2 = csv.reader(tsvin_2, delimiter='\t')
87
88 tsvout = csv.writer(tsvout, delimiter='\t') 157 tsvout = csv.writer(tsvout, delimiter='\t')
89 158
90 # Processes the PSM report 159 headers = []
91 if (psm_filename != 'None'): 160
92 tsvin_3 = open(psm_filename) 161 #loops through each row in the Variant Additional Details (VAD) file
93 psmreader = csv.reader(tsvin_3, delimiter='\t') 162 for row in tsvreader_2:
94 163
95 psmreader.next() 164 #sets row_2 equal to the same row in Variant Result (VR) file
96 peptide_map = {}
97 s = re.compile('[A-Z][0-9]+[A-Z]')
98 for row in psmreader:
99 pro_name = row[1]
100 pep_seq = row[2]
101
102 prot_seq_changes = s.findall(pro_name)
103
104 for change in prot_seq_changes:
105 if change in peptide_map:
106 if pep_seq not in peptide_map[change].split(';'):
107 peptide_map[change] = peptide_map[change] + ';' + pep_seq
108 else:
109 peptide_map[change] = pep_seq
110
111 #loops through each row in the Variant Additional Details file
112
113 print 'Checkpoint 3'
114 for row in tsvreader_2:
115 #sets row_2 equal to the same row in Variant Result file
116 row_2 = tsvreader_1.next() 165 row_2 = tsvreader_1.next()
117 #checks if row is empty or if the first term contains '#' 166 #checks if row is empty or if the first term contains '#'
118 if row == [] or row[0][0] == '#': 167 if row == [] or row[0][0] == '#':
119 tsvout.writerow(row) 168 tsvout.writerow(row)
120 else: 169 else:
121 #checks if the row begins with input line 170 if row[0] == 'Input line':
122 if row[0] == 'Input line':
123 #Goes through each value in the headers list in VAD 171 #Goes through each value in the headers list in VAD
124 #print 'Original row'
125 #print row
126 #print row_2
127 for value in row: 172 for value in row:
128 #Adds each value into headers 173 #Adds each value into headers
129 headers.append(value) 174 headers.append(value)
130 #Loops through the Keys in VR 175 #Loops through the Keys in VR
131 for value in row_2: 176 for i,value in enumerate(row_2):
132 #Checks if the value is already in headers 177 #Checks if the value is already in headers
133 if value in headers: 178 if value in headers:
179 duplicate_indices.append(i)
134 continue 180 continue
135 #else adds the header to headers 181 #else adds the header to headers
136 else: 182 else:
137 headers.append(value) 183 headers.append(value)
138 if (psm_filename != 'None'): 184 #Adds appropriate headers when proteomic input is supplied
139 headers.insert(1, 'Peptide') 185 if (probed_filename != 'None'):
140 #print headers 186 headers.insert(n, 'Variant peptide')
187 headers.insert(n, 'Reference peptide')
141 tsvout.writerow(headers) 188 tsvout.writerow(headers)
142 else: 189 else:
143
144 cells = [] 190 cells = []
145 #Inserts a peptide column into the row
146 if (psm_filename != 'None'):
147 if row[12] in peptide_map:
148 row.insert(1, peptide_map[row[12]])
149 else:
150 row.insert(1, '')
151
152 #Goes through each value in the next list 191 #Goes through each value in the next list
153 for i,value in enumerate(row): 192 for value in row:
154 #adds it to cells 193 #adds it to cells
155 cells.append(value) 194 cells.append(value)
156 #Goes through each value from the VR file after position 11 (After it is done repeating from VAD file) 195 #Goes through each value from the VR file after position 11 (After it is done repeating from VAD file)
157 for i,value in enumerate(row_2[11:]): 196 for i,value in enumerate(row_2):
158 #adds in the rest of the values to cells 197 #adds in the rest of the values to cells
159 198 if i not in duplicate_indices:
160 # Skips the 2nd VEST p-value 199 # Skips the initial 11 columns and the VEST p-value (already in VR file)
161 if (i != 49 - 11):
162 cells.append(value) 200 cells.append(value)
163 201
164 print cells 202 # Verifies the peptides intersected previously through sequences obtained from Ensembl's API
203 if (probed_filename != 'None'):
204 cells.insert(n,'')
205 cells.insert(n,'')
206 input_line = cells[headers.index('Input line')]
207 if input_line in pep_map:
208 pepseq = pep_map[input_line][0]
209 aa_changes = pep_muts[pepseq]
210 transcript_id = cells[headers.index('S.O. transcript')]
211 ref_fullseq = getSequence(transcript_id)
212 # Checks the other S.O. transcripts if the primary S.O. transcript has no sequence available
213 if not ref_fullseq:
214 transcripts = cells[headers.index('S.O. all transcripts')]
215 for transcript in transcripts.split(','):
216 if transcript:
217 mat = SOtranscripts.search(transcript)
218 ref_fullseq = getSequence(mat.group(1))
219 if ref_fullseq:
220 aa_changes = {mat.group(2): [aa_changes.values()[0][0]]}
221 break
222 # Resubmits the previous transcripts without extensions if all S.O. transcripts fail to provide a sequence
223 if not ref_fullseq:
224 transcripts = cells[headers.index('S.O. all transcripts')]
225 for transcript in transcripts.split(','):
226 if transcript:
227 mat = SOtranscripts.search(transcript)
228 ref_fullseq = getSequence(mat.group(1).split('.')[0])
229 if ref_fullseq:
230 aa_changes = {mat.group(2): [aa_changes.values()[0][0]]}
231 break
232 if ref_fullseq:
233 # Sorts the amino acid changes
234 positions = {}
235 for aa_change in aa_changes:
236 m = reg_seq_change.search(aa_change)
237 aa_protpos = int(m.group(2))
238 aa_peppos = aa_changes[aa_change][0]
239 aa_startpos = aa_protpos - aa_peppos - 1
240 if aa_startpos in positions:
241 positions[aa_startpos].append(aa_change)
242 else:
243 positions[aa_startpos] = [aa_change]
244 # Goes through the sorted categories to mutate the Ensembl peptide (uses proBED peptide as a reference)
245 for pep_protpos in positions:
246 ref_seq = ref_fullseq[pep_protpos:pep_protpos+len(pepseq)]
247 muts = positions[pep_protpos]
248 options = []
249 mut_seq = ref_seq
250 for mut in muts:
251 m = reg_seq_change.search(mut)
252 ref_aa = m.group(1)
253 mut_pos = int(m.group(2))
254 alt_aa = m.group(3)
255 pep_mutpos = mut_pos - pep_protpos - 1
256 if ref_seq[pep_mutpos] == ref_aa and (pepseq[pep_mutpos] == alt_aa or pepseq[pep_mutpos] == ref_aa):
257 if pepseq[pep_mutpos] == ref_aa:
258 mut_seq = mut_seq[:pep_mutpos] + ref_aa + mut_seq[pep_mutpos+1:]
259 else:
260 mut_seq = mut_seq[:pep_mutpos] + alt_aa + mut_seq[pep_mutpos+1:]
261 else:
262 break
263 # Adds the mutated peptide and reference peptide if mutated correctly
264 if pepseq == mut_seq:
265 cells[n+1] = pepseq
266 cells[n] = ref_seq
267 #print cells
165 tsvout.writerow(cells) 268 tsvout.writerow(cells)
166 269
167 270
168 271
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193 272
194 273
195 274
196 #a = 'col1\tcol2\tcol3' 275 #a = 'col1\tcol2\tcol3'
197 #header_list = a.split('\t') 276 #header_list = a.split('\t')