Mercurial > repos > rsajulga > proteomic_cravat_score_and_annotate
comparison cravat_submit.py @ 0:7ebdd4ac13a2 draft
Uploaded
author | rsajulga |
---|---|
date | Tue, 10 Apr 2018 15:53:55 -0400 |
parents | |
children | 676c8be98be4 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:7ebdd4ac13a2 |
---|---|
1 import requests | |
2 import json | |
3 import time | |
4 import urllib | |
5 import sys | |
6 import csv | |
7 import re | |
8 | |
# Read the eight positional parameters from the command line. When any of
# them is missing (sys.argv is too short), fall back to hard-coded local
# fixture paths so the script can be run by hand.
try:
    input_filename = sys.argv[1]
    input_select_bar = sys.argv[2]
    GRCh_build = sys.argv[3]
    psm_filename = sys.argv[4]
    output_filename = sys.argv[5]
    file_3 = sys.argv[6]
    file_4 = sys.argv[7]
    file_5 = sys.argv[8]
except IndexError:
    # Only a missing argument triggers the fallback; any other error
    # (e.g. KeyboardInterrupt) now propagates instead of being swallowed.
    # An earlier fixture, '1.) Galaxy2-[Human_Vcf_MCF7]-minimum.vcf', used to
    # be assigned here first but was immediately overwritten by this one.
    input_filename = 'input/[tgriffin_cguerrer_20160726_MCF7_RNAseq_01_S13_R1_001.vcf].vcf'
    input_select_bar = 'VEST'
    GRCh_build = 'GRCh38'
    output_filename = 'combined_variants.tsv'
    psm_filename = 'input/[ERLIC_MCF7_110kb_R123-CustomProDB_RNA-Seq_cRAP_DB.psm-report].tabular'
    # Take the HH:MM stamp once so the three fallback outputs share a prefix.
    _stamp = time.strftime("%H:%M")
    file_3 = 'output/' + _stamp + '_Z_Gene_Level_Analysis.tsv'
    file_4 = 'output/' + _stamp + '_Z_Variant_Non-coding.Result.tsv'
    file_5 = 'output/' + _stamp + '_Z_Input_Errors.Result.tsv'
28 | |
29 | |
30 #in_file = open('input_call.txt', "r") | |
31 #out_file = open('output_call.txt', "w") | |
32 | |
# Flag assigned here; nothing later in this script reads it.
write_header = True

# Value sent as the 'hg19' form field of the submit request:
# 'on' when the selected build is GRCh37, 'off' for any other build.
GRCh37hg19 = 'on' if GRCh_build == 'GRCh37' else 'off'
38 | |
# Submit the input file to the CRAVAT staging REST service together with the
# notification e-mail, the selected analyses and the hg19 on/off flag.
# The upload handle is opened in a `with` block so it is closed once the
# request has been sent (the original left it open for the whole run).
with open(input_filename) as upload_handle:
    submit = requests.post(
        'http://staging.cravat.us/CRAVAT/rest/service/submit',
        files={'inputfile': upload_handle},
        data={'email': 'znylund@insilico.us.com',
              'analyses': input_select_bar,
              'hg19': GRCh37hg19})

# Fail with a clear HTTP error rather than a confusing JSON/KeyError below.
submit.raise_for_status()

# Parse the JSON reply once and pull out the job id and submission status.
submit_reply = json.loads(submit.text)
jobid = submit_reply['jobid']
submitted = submit_reply['status']
49 | |
# The original opened input_filename a second time here and never used or
# closed the handle; that leaked open() has been dropped.

# Matches any text containing at least one '#'.
# NOTE(review): not referenced again in this script.
is_comment_line = re.compile(".*#+.*")

# Poll the status endpoint every 2 seconds until the job reports 'Success'.
# NOTE(review): there is no timeout and no handling of a failed job, so a job
# that never reaches 'Success' makes this loop run forever -- confirm which
# failure statuses the service reports before adding an exit path.
while True:
    check = requests.get('http://staging.cravat.us/CRAVAT/rest/service/status', params={'jobid': jobid})
    # Decode the reply once per poll instead of once per field.
    status_reply = json.loads(check.text)
    status = status_reply['status']
    resultfileurl = status_reply['resultfileurl']
    if status == 'Success':
        break
    time.sleep(2)
65 | |
66 #out_file.write('\n') | |
67 | |
# Local names for the two intermediate downloads that are merged later;
# both are prefixed with the current HH:MM time.
file_1 = time.strftime("%H:%M") + '_Z_Variant_Result.tsv'
file_2 = time.strftime("%H:%M") + '_Z_Additional_Details.tsv'

# Fetch the five result tables of this job, in the same order as before:
# (name on the server, local destination path).
results_base_url = "http://staging.cravat.us/CRAVAT/results/" + jobid + "/"
for remote_name, local_path in (
        ("Variant.Result.tsv", file_1),
        ("Variant_Additional_Details.Result.tsv", file_2),
        ("Gene_Level_Analysis.Result.tsv", file_3),
        ("Variant_Non-coding.Result.tsv", file_4),
        ("Input_Errors.Result.tsv", file_5)):
    urllib.urlretrieve(results_base_url + remote_name, local_path)
79 | |
# Combined header row, filled in when the 'Input line' header is reached.
headers = []

# Column positions hard-coded by this script.
# NOTE(review): presumably tied to one specific CRAVAT report layout --
# confirm against the current result files before relying on them.
PROTEIN_CHANGE_COL = 12   # Additional-Details column looked up in peptide_map
SHARED_COL_COUNT = 11     # leading Variant-Result columns repeated from Additional Details
SECOND_VEST_PVALUE_COL = 49  # Variant-Result column left out of the merged row


def _load_peptide_map(psm_path):
    """Return {protein sequence change: ';'-joined peptide sequences}.

    Reads the tab-separated PSM report at *psm_path*, skips its first row,
    and for every later row takes column 1 as the protein name and column 2
    as the peptide sequence. Each substring of the protein name matching
    ``[A-Z][0-9]+[A-Z]`` becomes a key; peptide sequences seen for that key
    are joined with ';' without repeats. Rows with fewer than three columns
    are skipped (the original raised IndexError on them).
    """
    change_pattern = re.compile('[A-Z][0-9]+[A-Z]')
    peptides_by_change = {}
    # `with` closes the PSM report; the original never closed this handle.
    with open(psm_path) as psm_in:
        psm_rows = csv.reader(psm_in, delimiter='\t')
        next(psm_rows, None)  # skip the first (header) row; tolerate an empty file
        for psm_row in psm_rows:
            if len(psm_row) < 3:
                continue
            pro_name = psm_row[1]
            pep_seq = psm_row[2]
            for change in change_pattern.findall(pro_name):
                if change in peptides_by_change:
                    if pep_seq not in peptides_by_change[change].split(';'):
                        peptides_by_change[change] = peptides_by_change[change] + ';' + pep_seq
                else:
                    peptides_by_change[change] = pep_seq
    return peptides_by_change


# The literal string 'None' means "no PSM report was supplied".
use_psm = psm_filename != 'None'
peptide_map = _load_peptide_map(psm_filename) if use_psm else {}

# Walk the Variant Result (file_1) and Variant Additional Details (file_2)
# tables in lock-step and write the merged table to output_filename.
# 'wb' is the Python 2 csv convention for the output file.
with open(file_1) as tsvin_1, open(file_2) as tsvin_2, open(output_filename, 'wb') as tsvout:
    tsvreader_1 = csv.reader(tsvin_1, delimiter='\t')
    tsvreader_2 = csv.reader(tsvin_2, delimiter='\t')
    # Separate name for the writer so the file object is not shadowed.
    writer = csv.writer(tsvout, delimiter='\t')

    print('Checkpoint 3')
    for row in tsvreader_2:
        # Same-numbered row of the Variant Result file. Raises StopIteration
        # if that file has fewer rows than Additional Details (as before).
        row_2 = next(tsvreader_1)

        if not row or row[0].startswith('#'):
            # Blank rows and '#' comment rows are copied through unchanged.
            # startswith() also copes with an empty first cell, where the
            # original row[0][0] raised IndexError.
            writer.writerow(row)
        elif row[0] == 'Input line':
            # Header row: all Additional-Details headers first, then every
            # Variant-Result header not already present.
            # NOTE(review): data rows below are merged by position, not by
            # this name-based de-duplication -- confirm the two stay aligned.
            headers.extend(row)
            for value in row_2:
                if value not in headers:
                    headers.append(value)
            if use_psm:
                headers.insert(1, 'Peptide')
            writer.writerow(headers)
        else:
            if use_psm:
                # Insert the peptide column at index 1; it stays empty when
                # the change is unknown or the row is too short to have one.
                change = row[PROTEIN_CHANGE_COL] if len(row) > PROTEIN_CHANGE_COL else None
                row.insert(1, peptide_map.get(change, ''))

            # Additional-Details cells, then the Variant-Result cells past
            # the shared leading columns, skipping the second VEST p-value.
            cells = list(row)
            skip_offset = SECOND_VEST_PVALUE_COL - SHARED_COL_COUNT
            cells.extend(value
                         for offset, value in enumerate(row_2[SHARED_COL_COUNT:])
                         if offset != skip_offset)

            print(cells)
            writer.writerow(cells)
166 | |
167 | |
168 | |
169 | |
170 | |
171 | |
172 | |
173 | |
174 | |
175 | |
176 | |
177 | |
178 | |
179 | |
180 | |
181 | |
182 | |
183 | |
184 | |
185 | |
186 | |
187 | |
188 | |
189 | |
190 | |
191 | |
192 | |
193 | |
194 | |
195 | |
196 #a = 'col1\tcol2\tcol3' | |
197 #header_list = a.split('\t') | |
198 | |
199 #loop through the two results, when you first hit header you print out the headers in tabular form | |
200 #Print out each header only once | |
201 #Combine both headers into one output file | |
202 #loop through the rest of the data and assign each value to its assigned header | |
203 #combine this all into one output file | |
204 | |
205 | |
206 | |
207 | |
208 |