comparison cravat_submit.py @ 0:7ebdd4ac13a2 draft

Uploaded
author rsajulga
date Tue, 10 Apr 2018 15:53:55 -0400
parents
children 676c8be98be4
comparison
equal deleted inserted replaced
-1:000000000000 0:7ebdd4ac13a2
1 import requests
2 import json
3 import time
4 import urllib
5 import sys
6 import csv
7 import re
8
# Positional command-line arguments:
#   1  input file submitted for analysis
#   2  value sent as the 'analyses' field of the submission
#   3  genome build name (compared against 'GRCh37' further down)
#   4  PSM report path, or the literal string 'None' to skip it
#   5  path of the combined output table
#   6-8  destinations for the gene-level, non-coding and input-error reports
try:
    input_filename = sys.argv[1]
    input_select_bar = sys.argv[2]
    GRCh_build = sys.argv[3]
    psm_filename = sys.argv[4]
    output_filename = sys.argv[5]
    file_3 = sys.argv[6]
    file_4 = sys.argv[7]
    file_5 = sys.argv[8]
except IndexError:
    # Fewer than eight arguments were given: fall back to the built-in
    # defaults. Only IndexError is caught so that unrelated failures
    # (KeyboardInterrupt, typos, ...) are no longer silently swallowed.
    input_filename = 'input/[tgriffin_cguerrer_20160726_MCF7_RNAseq_01_S13_R1_001.vcf].vcf'
    input_select_bar = 'VEST'
    GRCh_build = 'GRCh38'
    output_filename = 'combined_variants.tsv'
    psm_filename = 'input/[ERLIC_MCF7_110kb_R123-CustomProDB_RNA-Seq_cRAP_DB.psm-report].tabular'
    # Default report names are prefixed with the current hour and minute.
    file_3 = 'output/' + time.strftime("%H:%M") + '_Z_Gene_Level_Analysis.tsv'
    file_4 = 'output/' + time.strftime("%H:%M") + '_Z_Variant_Non-coding.Result.tsv'
    file_5 = 'output/' + time.strftime("%H:%M") + '_Z_Input_Errors.Result.tsv'
28
29
30 #in_file = open('input_call.txt', "r")
31 #out_file = open('output_call.txt', "w")
32
# Module-level flag, initialised to True.
write_header = True

# 'on' only when the requested genome build is GRCh37; 'off' for anything else.
GRCh37hg19 = 'on' if GRCh_build == 'GRCh37' else 'off'
38
# Submit the input file to the CRAVAT staging service:
#   http://staging.cravat.us/CRAVAT/rest/service/submit
# The file is opened in a `with` block so the handle is closed once the upload
# finishes, and in binary mode as the requests documentation recommends for
# multipart uploads.
with open(input_filename, 'rb') as upload:
    submit = requests.post(
        'http://staging.cravat.us/CRAVAT/rest/service/submit',
        files={'inputfile': upload},
        data={'email': 'znylund@insilico.us.com',
              'analyses': input_select_bar,
              'hg19': GRCh37hg19})

# Stop here with the HTTP error instead of failing later on an unparsable
# body or a missing key.
submit.raise_for_status()

# Decode the JSON reply once and keep the job id and the submission status.
submit_reply = json.loads(submit.text)
jobid = submit_reply['jobid']
submitted = submit_reply['status']
49
# The script used to open input_filename a second time at this point without
# ever reading or closing it; that leaked handle has been removed.

# Pattern matching any line that contains at least one '#'.
is_comment_line = re.compile(".*#+.*")

# Poll the status endpoint every two seconds until the job reports 'Success'.
while True:
    check = requests.get('http://staging.cravat.us/CRAVAT/rest/service/status',
                         params={'jobid': jobid})
    # Decode the reply once per poll instead of once per field.
    status_reply = json.loads(check.text)
    status = status_reply['status']
    resultfileurl = status_reply['resultfileurl']
    if status == 'Success':
        break
    # Without this check a failed job would keep the loop running forever.
    # NOTE(review): assumes the service reports failures as 'Error' or
    # 'Failed' -- confirm the exact values against the CRAVAT API.
    if status in ('Error', 'Failed'):
        raise RuntimeError('CRAVAT job %s ended with status %s' % (jobid, status))
    time.sleep(2)
65
66 #out_file.write('\n')
67
# Local file names for the two per-variant reports, each prefixed with the
# current hour and minute.
file_1 = time.strftime("%H:%M") + '_Z_Variant_Result.tsv'
file_2 = time.strftime("%H:%M") + '_Z_Additional_Details.tsv'

# Download all five result reports of the job, in the same order as before:
# (report name on the server, local destination).
for report_name, destination in (
        ("Variant.Result.tsv", file_1),
        ("Variant_Additional_Details.Result.tsv", file_2),
        ("Gene_Level_Analysis.Result.tsv", file_3),
        ("Variant_Non-coding.Result.tsv", file_4),
        ("Input_Errors.Result.tsv", file_5)):
    urllib.urlretrieve(
        "http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + report_name,
        destination)
79
# Column layout constants, taken from the inline comments of the original
# merge loop: the Variant Result columns before index 11 repeat columns of the
# Additional Details report, and index 49 holds a second VEST p-value that is
# left out of the merged rows.
VR_FIRST_NEW_COLUMN = 11
VR_SECOND_VEST_PVALUE_COLUMN = 49
# Additional Details column whose value is looked up in the peptide map.
# NOTE(review): presumably the protein sequence change (e.g. "A123T") -- confirm.
VAD_PEPTIDE_KEY_COLUMN = 12


def _load_peptide_map(psm_path):
    """Return {sequence change: ';'-joined peptide sequences} from a PSM report.

    The report is read as tab-separated text and its first line is skipped as
    a header. For every remaining row, each match of ``[A-Z][0-9]+[A-Z]`` in
    the second column is associated with the value of the third column.
    Peptides are kept in first-seen order and listed once per change. Rows
    with fewer than three columns are ignored.
    """
    change_pattern = re.compile('[A-Z][0-9]+[A-Z]')
    peptides_by_change = {}
    # `with` guarantees the report is closed (it used to stay open).
    with open(psm_path) as psm_file:
        psm_reader = csv.reader(psm_file, delimiter='\t')
        next(psm_reader, None)  # skip the header line; tolerate an empty file
        for psm_row in psm_reader:
            if len(psm_row) < 3:
                continue
            pro_name = psm_row[1]
            pep_seq = psm_row[2]
            for change in change_pattern.findall(pro_name):
                found = peptides_by_change.setdefault(change, [])
                if pep_seq not in found:
                    found.append(pep_seq)
    return dict((change, ';'.join(found))
                for change, found in peptides_by_change.items())


def _merge_header(vad_header, vr_header, with_peptide):
    """Return the Additional Details header followed by every Variant Result
    column name not already present; 'Peptide' is inserted at index 1 when
    *with_peptide* is true."""
    merged = list(vad_header)
    for column_name in vr_header:
        if column_name not in merged:
            merged.append(column_name)
    if with_peptide:
        merged.insert(1, 'Peptide')
    return merged


def _merge_row(vad_row, vr_row, peptide_lookup):
    """Return one merged data row.

    The row starts with all Additional Details cells. When *peptide_lookup*
    is not None, the peptide string for the row's key column (or '' when
    there is none, or the row is too short to have that column) is inserted
    at index 1. The Variant Result cells from index 11 on are then appended,
    except the second VEST p-value.
    """
    cells = list(vad_row)
    if peptide_lookup is not None:
        if len(vad_row) > VAD_PEPTIDE_KEY_COLUMN:
            peptide = peptide_lookup.get(vad_row[VAD_PEPTIDE_KEY_COLUMN], '')
        else:
            peptide = ''
        cells.insert(1, peptide)
    skipped = VR_SECOND_VEST_PVALUE_COLUMN - VR_FIRST_NEW_COLUMN
    cells.extend(value
                 for offset, value in enumerate(vr_row[VR_FIRST_NEW_COLUMN:])
                 if offset != skipped)
    return cells


headers = []
duplicates = []

# The literal string 'None' means that no PSM report was supplied.
use_psm = psm_filename != 'None'
peptide_map = _load_peptide_map(psm_filename) if use_psm else None

# Read the Variant Result and Additional Details reports in lockstep and write
# the combined table. 'wb' is the Python 2 convention for csv output files
# (Python 3 would need mode 'w' with newline='').
with open(file_1) as tsvin_1, open(file_2) as tsvin_2, open(output_filename, 'wb') as tsvout:
    tsvreader_1 = csv.reader(tsvin_1, delimiter='\t')
    tsvreader_2 = csv.reader(tsvin_2, delimiter='\t')
    # A separate name for the writer so that `tsvout` keeps referring to the file.
    tsvwriter = csv.writer(tsvout, delimiter='\t')

    print('Checkpoint 3')
    for row in tsvreader_2:
        # Matching line of the Variant Result report.
        row_2 = next(tsvreader_1)
        if not row or row[0].startswith('#'):
            # Blank and '#' comment lines are copied through unchanged.
            # startswith() also copes with an empty first cell, where
            # row[0][0] used to raise IndexError.
            tsvwriter.writerow(row)
        elif row[0] == 'Input line':
            # Header line: merge both headers and write them once.
            headers = _merge_header(row, row_2, use_psm)
            tsvwriter.writerow(headers)
        else:
            cells = _merge_row(row, row_2, peptide_map)
            print(cells)
            tsvwriter.writerow(cells)
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196 #a = 'col1\tcol2\tcol3'
197 #header_list = a.split('\t')
198
199 #loop through the two results, when you first hit header you print out the headers in tabular form
200 #Print out each header only once
201 #Combine both headers into one output file
202 #loop through the rest of the data and assign each value to its assigned header
203 #combine this all into one output file
204
205
206
207
208