annotate cravat_submit.py @ 0:7ebdd4ac13a2 draft

Uploaded
author rsajulga
date Tue, 10 Apr 2018 15:53:55 -0400 (2018-04-10)
parents
children 676c8be98be4
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
1 import requests
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
2 import json
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
3 import time
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
4 import urllib
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
5 import sys
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
6 import csv
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
7 import re
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
8
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
# Positional command-line arguments, in order: the file uploaded to CRAVAT,
# the analyses to request, the genome build name, the PSM report path (the
# literal string 'None' disables peptide mapping), the combined output path,
# and the local destinations for three additional result downloads.
try:
    input_filename = sys.argv[1]
    input_select_bar = sys.argv[2]
    GRCh_build = sys.argv[3]
    psm_filename = sys.argv[4]
    output_filename = sys.argv[5]
    file_3 = sys.argv[6]
    file_4 = sys.argv[7]
    file_5 = sys.argv[8]
except IndexError:
    # Too few arguments were given: fall back to the hard-coded local test
    # fixtures.  Every setting is replaced, including any that were read
    # successfully before the missing argument was reached.  Only IndexError
    # is handled so that unrelated failures (e.g. KeyboardInterrupt) are not
    # silently masked.
    input_filename = 'input/[tgriffin_cguerrer_20160726_MCF7_RNAseq_01_S13_R1_001.vcf].vcf'
    input_select_bar = 'VEST'
    GRCh_build = 'GRCh38'
    output_filename = 'combined_variants.tsv'
    psm_filename = 'input/[ERLIC_MCF7_110kb_R123-CustomProDB_RNA-Seq_cRAP_DB.psm-report].tabular'
    # Fallback download targets are prefixed with the current hour:minute.
    file_3 = 'output/' + time.strftime("%H:%M") + '_Z_Gene_Level_Analysis.tsv'
    file_4 = 'output/' + time.strftime("%H:%M") + '_Z_Variant_Non-coding.Result.tsv'
    file_5 = 'output/' + time.strftime("%H:%M") + '_Z_Input_Errors.Result.tsv'
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
28
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
29
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
30 #in_file = open('input_call.txt', "r")
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
31 #out_file = open('output_call.txt', "w")
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
32
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
# Flag that is set here but not read anywhere else in the visible script.
write_header = True

# Value sent as the 'hg19' form field of the submission request: 'on' only
# when the requested build is exactly 'GRCh37', otherwise 'off'.
GRCh37hg19 = 'on' if GRCh_build == 'GRCh37' else 'off'
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
38
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
# Submit the input file to the CRAVAT staging service:
#   http://staging.cravat.us/CRAVAT/rest/service/submit
# The upload handle lives in a ``with`` block so it is closed as soon as the
# request has been sent instead of leaking for the rest of the run.
with open(input_filename) as upload:
    submit = requests.post(
        'http://staging.cravat.us/CRAVAT/rest/service/submit',
        files={'inputfile': upload},
        data={'email': 'znylund@insilico.us.com',
              'analyses': input_select_bar,
              'hg19': GRCh37hg19})

# Surface HTTP-level failures directly rather than as a confusing JSON
# decoding error or KeyError on the lines below.
submit.raise_for_status()

# Decode the JSON reply once.  'jobid' identifies the job for the status
# polling and result downloads; 'status' is the status reported at submit time.
submit_reply = json.loads(submit.text)
jobid = submit_reply['jobid']
submitted = submit_reply['status']

# Compiled here but not referenced anywhere else in the visible script.
is_comment_line = re.compile(".*#+.*")
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
52
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
53
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
# Poll the status endpoint every two seconds until the job reports 'Success'.
while True:
    check = requests.get('http://staging.cravat.us/CRAVAT/rest/service/status', params={'jobid': jobid})
    # Decode the reply once per poll instead of once per field.
    check_reply = json.loads(check.text)
    status = check_reply['status']
    resultfileurl = check_reply['resultfileurl']
    if status == 'Success':
        break
    # Without a failure exit this loop would spin forever on a job that can
    # never succeed.
    # NOTE(review): 'Error' is assumed to be the service's terminal failure
    # status -- confirm against the CRAVAT web-service documentation.
    if status == 'Error':
        raise RuntimeError('CRAVAT job %s failed with status %r' % (jobid, status))
    time.sleep(2)
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
65
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
66 #out_file.write('\n')
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
67
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
# Local names for the two result tables that are merged further down; each is
# prefixed with the current hour:minute.
file_1 = time.strftime("%H:%M") + '_Z_Variant_Result.tsv'
file_2 = time.strftime("%H:%M") + '_Z_Additional_Details.tsv'

# Download all five result tables of this job.  Each entry pairs the file name
# on the server with the local path it is saved to; the order matches the
# original sequence of downloads.
results_url = "http://staging.cravat.us/CRAVAT/results/" + jobid + "/"
for remote_name, local_path in (
        ("Variant.Result.tsv", file_1),
        ("Variant_Additional_Details.Result.tsv", file_2),
        ("Gene_Level_Analysis.Result.tsv", file_3),
        ("Variant_Non-coding.Result.tsv", file_4),
        ("Input_Errors.Result.tsv", file_5)):
    urllib.urlretrieve(results_url + remote_name, local_path)
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
79
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
# Combined header row, filled in when the 'Input line' header row is reached.
headers = []
# Not referenced anywhere else in the visible script.
duplicates = []

# Column positions that the merge below relies on.
PROTEIN_CHANGE_COLUMN = 12  # VAD column whose value is looked up in peptide_map
VR_FIRST_EXTRA_COLUMN = 11  # VR columns before this one repeat the VAD columns
VR_SKIPPED_COLUMN = 49      # VR column left out of data rows (2nd VEST p-value)

# The literal string 'None' means no PSM report was supplied.
use_psm = psm_filename != 'None'

# Maps each sequence-change token (letter, digits, letter -- e.g. "A123T")
# found in the second column of the PSM report to the ';'-joined, de-duplicated
# values of the third column (the peptide sequences) of the rows mentioning it.
peptide_map = {}
if use_psm:
    change_pattern = re.compile('[A-Z][0-9]+[A-Z]')
    # ``with`` guarantees the PSM report is closed once it has been read.
    with open(psm_filename) as psm_in:
        psmreader = csv.reader(psm_in, delimiter='\t')
        # Skip the header row; the default keeps an empty file from raising.
        next(psmreader, None)
        for psm_row in psmreader:
            if len(psm_row) < 3:
                # Blank or truncated line: nothing to map.
                continue
            pro_name = psm_row[1]
            pep_seq = psm_row[2]
            for change in change_pattern.findall(pro_name):
                if change not in peptide_map:
                    peptide_map[change] = pep_seq
                elif pep_seq not in peptide_map[change].split(';'):
                    peptide_map[change] = peptide_map[change] + ';' + pep_seq

# Walk the Variant Result (VR, file_1) and Variant Additional Details (VAD,
# file_2) tables in lockstep and write the combined table to output_filename.
# The output is opened in binary mode, as the Python 2 csv module requires.
with open(file_1) as tsvin_1, open(file_2) as tsvin_2, \
        open(output_filename, 'wb') as tsvout:
    tsvreader_1 = csv.reader(tsvin_1, delimiter='\t')
    tsvreader_2 = csv.reader(tsvin_2, delimiter='\t')
    tsvwriter = csv.writer(tsvout, delimiter='\t')

    # Progress/debug output retained from the original script.
    print('Checkpoint 3')
    for row in tsvreader_2:
        # Row of the VR table at the same position as the current VAD row.
        row_2 = next(tsvreader_1)

        if not row or row[0].startswith('#'):
            # Blank lines and '#' comment lines are copied through unchanged.
            # startswith() also copes with an empty first cell, where
            # indexing row[0][0] would raise IndexError.
            tsvwriter.writerow(row)
        elif row[0] == 'Input line':
            # Header row: all VAD column names, followed by every VR column
            # name that is not already present.
            headers.extend(row)
            for value in row_2:
                if value not in headers:
                    headers.append(value)
            if use_psm:
                headers.insert(1, 'Peptide')
            tsvwriter.writerow(headers)
        else:
            # Data row.  With a PSM report, a peptide cell is inserted as the
            # second column; it stays empty when the row is too short or its
            # value has no entry in peptide_map.
            if use_psm:
                if len(row) > PROTEIN_CHANGE_COLUMN:
                    peptide = peptide_map.get(row[PROTEIN_CHANGE_COLUMN], '')
                else:
                    peptide = ''
                row.insert(1, peptide)

            # All VAD cells, then the VR cells that do not repeat them,
            # leaving out the skipped VR column.
            cells = list(row)
            for position, value in enumerate(row_2):
                if position >= VR_FIRST_EXTRA_COLUMN and position != VR_SKIPPED_COLUMN:
                    cells.append(value)

            # Per-row debug output retained from the original script.
            print(cells)
            tsvwriter.writerow(cells)
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
166
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
167
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
168
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
169
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
170
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
171
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
172
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
173
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
174
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
175
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
176
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
177
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
178
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
179
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
180
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
181
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
182
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
183
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
184
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
185
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
186
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
187
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
188
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
189
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
190
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
191
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
192
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
193
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
194
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
195
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
196 #a = 'col1\tcol2\tcol3'
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
197 #header_list = a.split('\t')
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
198
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
199 #loop through the two results, when you first hit header you print out the headers in tabular form
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
200 #Print out each header only once
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
201 #Combine both headers into one output file
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
202 #loop through the rest of the data and assign each value to its assigned header
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
203 #combine this all into one output file
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
204
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
205
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
206
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
207
7ebdd4ac13a2 Uploaded
rsajulga
parents:
diff changeset
208