Mercurial > repos > nanettec > classifier
comparison classifier/classifier.py @ 0:ef9c2044d86a draft
Uploaded
author | nanettec |
---|---|
date | Fri, 18 Mar 2016 05:15:29 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:ef9c2044d86a |
---|---|
1 """ | |
2 @summary: Classify eQTLs as cis or trans - calculate "hybrid" overlap score | |
3 @author: nanette.coetzer@gmail.com | |
4 @version 5 | |
5 | |
6 """ | |
7 import optparse, sys | |
8 import tempfile | |
9 import os, re | |
10 import subprocess | |
11 | |
12 def stop_err( msg ): | |
13 sys.stderr.write( "%s\n" % msg ) | |
14 sys.exit() | |
15 | |
16 def __main__(): | |
17 #Parse Command Line | |
18 parser = optparse.OptionParser() | |
19 parser.add_option("-m", "--rscript", default=None, dest="rscript", | |
20 help="R script") | |
21 parser.add_option("-i", "--input1", default=None, dest="input1", | |
22 help="eQTLs file") | |
23 parser.add_option("-j", "--input2", default=None, dest="input2", | |
24 help="Chr summary file") | |
25 parser.add_option("-k", "--input3", default=None, dest="input3", | |
26 help="Gene positions file") | |
27 parser.add_option("-l", "--input4", default=None, dest="input4", | |
28 help="Lookup file") | |
29 | |
30 #parser.add_option("-m", "--score_type", default=2, dest="score_type", | |
31 # help="Select dM scale (1) or hybrid (2)") | |
32 | |
33 parser.add_option("-o", "--output1", default=None, dest="output1", | |
34 help="All classification file") | |
35 parser.add_option("-p", "--output2", default=None, dest="output2", | |
36 help="Cis classification file") | |
37 parser.add_option("-q", "--output3", default=None, dest="output3", | |
38 help="Trans classification file") | |
39 parser.add_option("-r", "--output4", default=None, dest="output4", | |
40 help="Classification summary file") | |
41 parser.add_option("-s", "--output5", default=None, dest="output5", | |
42 help="Chr summary file v2") | |
43 parser.add_option("-t", "--output6", default=None, dest="output6", | |
44 help="Gene positions file v2") | |
45 parser.add_option("-u", "--output7", default=None, dest="output7", | |
46 help="eQTLs per gene summary file") | |
47 parser.add_option("-v", "--output8", default=None, dest="output8", | |
48 help="eQTL vs gene position plot") | |
49 (options, args) = parser.parse_args() | |
50 | |
51 try: | |
52 open(options.input1, "r").close() | |
53 except TypeError, e: | |
54 stop_err("You need to supply the eQTL file:\n" + str(e)) | |
55 except IOError, e: | |
56 stop_err("Can not open the eQTL file:\n" + str(e)) | |
57 | |
58 try: | |
59 open(options.input2, "r").close() | |
60 except TypeError, e: | |
61 stop_err("You need to supply the Chr summary file:\n" + str(e)) | |
62 except IOError, e: | |
63 stop_err("Can not open the Chr summary file:\n" + str(e)) | |
64 | |
65 try: | |
66 open(options.input3, "r").close() | |
67 except TypeError, e: | |
68 stop_err("You need to supply the Gene positions file:\n" + str(e)) | |
69 except IOError, e: | |
70 stop_err("Can not open the Gene positions file:\n" + str(e)) | |
71 | |
72 try: | |
73 open(options.input4, "r").close() | |
74 except TypeError, e: | |
75 stop_err("You need to supply the Lookup file:\n" + str(e)) | |
76 except IOError, e: | |
77 stop_err("Can not open the Lookup file:\n" + str(e)) | |
78 | |
79 ############################################## | |
80 lookup_list = [] | |
81 | |
82 # Convert eQTL input file, change marker number of last interval in each chromosome | |
83 check2 = "" | |
84 prev_m = "0" | |
85 prev_c = "1" | |
86 r = re.compile("^(\d+)\s+(\d+)\s+(\d+.\d+)\s+") | |
87 indict = {} | |
88 max_cM_dict = {} | |
89 lookup = open(options.input4,'r') | |
90 for line in lookup: | |
91 l = line.strip().split("\t") | |
92 if l[6] == "0": | |
93 #print l | |
94 max_cM_dict[l[1]] = l[4] | |
95 lookup_list.append(l) | |
96 if l[0] != "id" and l[6] != "0": | |
97 c = l[1] | |
98 if prev_c != c: | |
99 indict[prev_c+"\t"+str(float(p))] = prev_c+"\t"+m+"\t"+str(float(p)) | |
100 m = l[2] | |
101 p = l[3] | |
102 prev_c = c | |
103 #print max_cM_dict | |
104 | |
105 #add last line of last chromosome | |
106 indict[prev_c+"\t"+p] = prev_c+"\t"+m+"\t"+p | |
107 | |
108 #print indict | |
109 | |
110 temp_file = tempfile.mktemp() | |
111 eqtl_temp_output = open(temp_file,'w') | |
112 | |
113 eQTL_size = 0 | |
114 eQTL_nr = 0 | |
115 eqtl_in = open(options.input1,'r') | |
116 for line in eqtl_in: | |
117 l = line.strip().split("\t") | |
118 if l[3].startswith("c"): | |
119 header = 1 # yes | |
120 else: | |
121 header = 0 # no | |
122 if header != 1: | |
123 sfull = l[3]+"\t"+l[13]+"\t"+str(float(l[14])) | |
124 efull = l[3]+"\t"+l[15]+"\t"+str(float(l[16])) | |
125 pfull = l[3]+"\t"+l[4]+"\t"+str(float(l[5])) | |
126 sshort = l[3]+"\t"+str(float(l[14])) | |
127 eshort = l[3]+"\t"+str(float(l[16])) | |
128 pshort = l[3]+"\t"+str(float(l[5])) | |
129 | |
130 if sshort in indict.keys(): | |
131 sfull = indict[sshort] | |
132 if eshort in indict.keys(): | |
133 efull = indict[eshort] | |
134 if pshort in indict.keys(): | |
135 pfull = indict[pshort] | |
136 s = sfull.split("\t") | |
137 e = efull.split("\t") | |
138 p = pfull.split("\t") | |
139 eqtl_temp_output.write(l[0]+"\t"+l[1]+"\t"+s[0]+"\t"+s[1]+"\t"+s[2]+"\t"+e[1]+"\t"+e[2]+"\t"+p[1]+"\t"+p[2]+"\t"+l[6]+"\t"+l[8]+"\t"+l[9]+"\t"+l[11]+"\n") | |
140 #print l[0]+"\t"+l[1]+"\t"+s[0]+"\t"+s[1]+"\t"+s[2]+"\t"+e[1]+"\t"+e[2]+"\t"+p[1]+"\t"+p[2]+"\t"+l[6]+"\t"+l[8]+"\t"+l[9]+"\t"+l[11]+"\n" | |
141 | |
142 eQTL_size += (float(l[16])*100)-(float(l[14])*100) | |
143 eQTL_nr += 1 | |
144 lookup.close() | |
145 eqtl_in.close() | |
146 eqtl_temp_output.close() | |
147 | |
148 cutoff = (eQTL_size/float(eQTL_nr))/2 | |
149 #print "cutoff: ",cutoff | |
150 ############################################# | |
151 | |
152 cis = 0 | |
153 trans = 0 | |
154 no_result = 0 | |
155 total = 0 | |
156 cis_peakLR = 0 | |
157 cis_rsq = 0 | |
158 cis_rtsq = 0 | |
159 trans_peakLR = 0 | |
160 trans_rsq = 0 | |
161 trans_rtsq = 0 | |
162 nr_peakLR = 0 | |
163 nr_rsq = 0 | |
164 nr_rtsq = 0 | |
165 tot_peakLR = 0 | |
166 tot_rsq = 0 | |
167 tot_rtsq = 0 | |
168 | |
169 # Create 2 dictionaries with gene info: gene_dict and gene_bin_dict | |
170 gene_positions = open(options.input3,'r') # gene_pos.txt | |
171 gene_dict = {} | |
172 gene_bin_dict = {} | |
173 for line in gene_positions: | |
174 l = line.strip().split("\t") | |
175 if l[0].startswith("gene") or l[0].startswith("Gene") or l[0].startswith("name") or l[0].startswith("Name"): | |
176 header = 1 # yes | |
177 else: | |
178 header = 0 # no | |
179 if header != 1: | |
180 # create a gene dictionary form gene positions file | |
181 gene_dict[l[0]] = l[1:]+[0,0,0,0] | |
182 chr_gene = l[1] | |
183 bp_gene = round((int(l[2])+int(l[3]))/2.0,0) | |
184 | |
185 # for each gene, get the correct bin from the lookup file | |
186 # first read lookup file into a list of lists (lookup_list), faster | |
187 gene_bin = "NA" | |
188 #print lookup_list | |
189 for lu in lookup_list: | |
190 if (lu[1]==chr_gene) and (bp_gene>=int(float(lu[5]))) and (float(lu[6])!=0): | |
191 gene_bin = lu[0] | |
192 gene_bin_dict[l[0]]=gene_bin | |
193 | |
194 gene_positions.close() | |
195 #print "Done 2" | |
196 eqtl_outfile = open(options.output1,'w') # classify.txt | |
197 eqtl_outfile.write("gene\tindex\tchr\tstart_marker\tstart_int\tend_marker\tend_int\tpeak_marker\tpeak_int\tpeakLR\trsq\trtsq\tadditive\tclassification\teQTL_bin\tgene_bin\toverlap_score\tstatus\n") | |
198 | |
199 max_peakLR = 0.0 | |
200 eqtl_get_maxLR = open(temp_file,'r') | |
201 for eqtlline in eqtl_get_maxLR: | |
202 l = eqtlline.strip().split("\t") | |
203 if l[1].startswith("i"): | |
204 header = 1 # yes | |
205 else: | |
206 header = 0 # no | |
207 if header != 1: | |
208 if ((l[9] != 'inf') and (float(l[9]) > max_peakLR)): | |
209 max_peakLR = float(l[9]) | |
210 eqtl_get_maxLR.close() | |
211 | |
212 count = 0 | |
213 position = 0 | |
214 status = 0 | |
215 x2 = "" | |
216 | |
217 eqtl = open(temp_file,'r') # eQTLs.txt | |
218 for eqtlline in eqtl: | |
219 classification = "" | |
220 l = eqtlline.strip().split("\t") | |
221 if l[1].startswith("i"): | |
222 header = 1 # yes | |
223 else: | |
224 header = 0 # no | |
225 if header != 1: | |
226 gene = l[0] | |
227 # for each eqtl, get the correct bin from the lookup file (lookup_list) | |
228 eqtl_bin = "NA" | |
229 for lu in lookup_list: | |
230 if (lu[1]==l[2]) and (float(l[8])>=float(lu[3])) and (float(lu[6])!=0): | |
231 eqtl_bin = lu[0] | |
232 try: | |
233 g = gene_dict[gene] | |
234 position = 1 | |
235 except: | |
236 # if the position of the gene is not known, then eQTL cannot be classified | |
237 classification = "no_result" | |
238 overlap_score = 0 | |
239 position = 0 | |
240 full_partial = "." | |
241 if position == 1: | |
242 gene_chr = g[0] | |
243 eqtl_chr = l[2] | |
244 # if the gene and eQTL on different chromosomes, then eQTL classified as trans | |
245 if gene_chr != eqtl_chr: | |
246 classification = "trans" | |
247 overlap_score = 0 | |
248 full_partial = "." | |
249 # if the gene and eQTL on the same chromosome, then test if cis / trans | |
250 else: | |
251 #print "else" | |
252 gene_bp = int(round((int(g[1])+int(g[2]))/float(2),0)) | |
253 prev_bp = 0 | |
254 prev_line = [0,0,0,0.0,0.0,0] | |
255 lookup = open(options.input4,'r') # lookup.txt | |
256 status = 0 | |
257 for line in lookup: | |
258 look = line.strip().split("\t") | |
259 if look[1] == gene_chr: | |
260 # for the gene position in bp (v1), calculate the best estimate cM position (x2) | |
261 # if gene is NOT to the left of the 1st marker (< 0 cM) or to the right of the last marker, x2 will be calculated and status will be 1 | |
262 | |
263 if (int(float(gene_bp)) > int(float(prev_bp))) and (int(float(gene_bp)) <= int(float(look[5]))): | |
264 # 2 : cM and 1 : bp | |
265 #print "gene_bp = "+str(gene_bp) | |
266 #print "prev_bp = "+str(prev_bp) | |
267 #print "look[5] = "+str(look[5]) | |
268 v1 = int(gene_bp) | |
269 s1 = int(prev_bp) | |
270 l1 = int(float(look[5])) | |
271 s2 = float(prev_line[4]) | |
272 l2 = float(look[4]) | |
273 x2 = round(((v1-s1)*(l2-s2)/(l1-s1))+s2,2) | |
274 status = 1 | |
275 #print "x2 = "+str(x2) | |
276 prev_bp = look[5] | |
277 prev_line = look | |
278 lookup.close() | |
279 #print "status = "+str(status)+"\n" | |
280 if status == 0: | |
281 #print "*********" * 100 | |
282 #print "gene at end of chr\n\n\n" | |
283 x2 = round(float(max_cM_dict[gene_chr]),2) | |
284 #print x2 | |
285 g_bin = "NA" | |
286 # if the gene is to the left of the 1st marker (< 0 cM) or to the right of the last marker; this could be improved (for the maize data it is fine) | |
287 #classification = "no_result" | |
288 #print classification | |
289 if classification != "if status == 0:": | |
290 # get the eQTL start and end cM positions (from lookup table) | |
291 #lookup = open(options.input4,'r') # lookup.txt | |
292 #count = 0 | |
293 #for line in lookup: | |
294 # count += 1 | |
295 # if count > 1: | |
296 # look = line.strip().split("\t") | |
297 # if int(look[1]) == int(eqtl_chr) and float(look[3]) == float(l[4]): | |
298 # eqtl_start = float(look[4]) | |
299 # if int(look[1]) == int(eqtl_chr) and float(look[3]) == float(l[6]): | |
300 # eqtl_end = float(look[4]) | |
301 #lookup.close() | |
302 # compare calculated gene cM position (x2) with eQTL start and end cM positions | |
303 # calculate a region around the gene (5 cM to either side) and test whether (i) eQTL start or (ii) eQTL end overlaps with that region; or (iii) whether the eQTL is entirely in that region | |
304 eqtl_start = float(l[4])*100 | |
305 eqtl_end = float(l[6])*100 | |
306 eqtl_peak = float(l[8])*100 | |
307 #cutoff = 5 | |
308 | |
309 if (int(gene_chr) == int(eqtl_chr)) and ((eqtl_start >= x2-float(cutoff) and eqtl_start <= x2+float(cutoff)) or (eqtl_end >= x2-float(cutoff) and eqtl_end <= x2+float(cutoff)) or (eqtl_start <= x2-float(cutoff) and eqtl_end >= x2+float(cutoff))): | |
310 classification = "cis" | |
311 | |
312 #if options.score_type == str(1): | |
313 # # calculate overlap score (dM score) | |
314 # start_end_list = [float(eqtl_start/10), float(eqtl_end/10), (x2-float(cutoff))/10, (x2+float(cutoff))/10] | |
315 # start_end_list.sort() | |
316 # | |
317 # ratio = ((start_end_list[2]-start_end_list[1])+1)/float((start_end_list[3]-start_end_list[0])+1) | |
318 # diff_peaks = float(abs((x2/10)-float(eqtl_peak/10)))+1 | |
319 # overlap_score = float(ratio/diff_peaks) | |
320 #else: | |
321 # # calculate overlap score (hybrid score: full overlap (both peaks in overlapping region) = M scale; partial overlap = dM scale) | |
322 # # Example of scale: 0.0801 M = 0.801 dM = 8.01 cM | |
323 | |
324 start_end_list = [float(eqtl_start/10),float(eqtl_end/10),(x2-float(cutoff))/10, (x2+float(cutoff))/10] | |
325 start_end_list.sort() | |
326 ######### test for full / partial overlap ######### | |
327 if ((float(eqtl_peak/10) >= start_end_list[1]) and (float(eqtl_peak/10) <= start_end_list[2])) \ | |
328 and ((x2/10 >= start_end_list[1]) and (x2/10 <= start_end_list[2])): | |
329 ratio = ((start_end_list[2]/10-start_end_list[1]/10)+1)/float((start_end_list[3]/10-start_end_list[0]/10)+1) | |
330 diff_peaks = float(abs(x2/100-float(eqtl_peak/100)))+1 | |
331 overlap_score = float(ratio/diff_peaks) | |
332 full_partial = "full" | |
333 | |
334 else: | |
335 # partial overlap --> use dM scale (multiply position by 10) --> already done | |
336 ratio = ((start_end_list[2]-start_end_list[1])+1)/float((start_end_list[3]-start_end_list[0])+1) | |
337 diff_peaks = float(abs(x2/10-float(eqtl_peak/10)))+1 | |
338 overlap_score = float(ratio/diff_peaks) | |
339 full_partial = "partial" | |
340 | |
341 else: | |
342 classification = "trans" | |
343 overlap_score = 0 | |
344 full_partial = "." | |
345 if classification == "cis": | |
346 cis += 1 | |
347 cis_peakLR += float(l[9]) | |
348 cis_rsq += float(l[10]) | |
349 cis_rtsq += float(l[11]) | |
350 g[4] += 1 | |
351 if classification == "trans": | |
352 trans += 1 | |
353 trans_peakLR += float(l[9]) | |
354 trans_rsq += float(l[10]) | |
355 trans_rtsq += float(l[11]) | |
356 g[5] += 1 | |
357 if classification == "no_result": | |
358 no_result += 1 | |
359 nr_peakLR += float(l[9]) | |
360 nr_rsq += float(l[10]) | |
361 nr_rtsq += float(l[11]) | |
362 g[6] += 1 | |
363 g[3] += 1 | |
364 total += 1 | |
365 tot_peakLR += float(l[9]) | |
366 tot_rsq += float(l[10]) | |
367 tot_rtsq += float(l[11]) | |
368 try: | |
369 g_bin = gene_bin_dict[l[0]] | |
370 except: | |
371 g_bin = "NA" | |
372 if status == 0: | |
373 #g_bin = "end c"+str(gene_chr) | |
374 g_bin = "NA" | |
375 gene_bin_dict[l[0]] = g_bin | |
376 if x2 == 0.0: | |
377 #g_bin = "start c"+str(gene_chr) | |
378 g_bin = "NA" | |
379 gene_bin_dict[l[0]] = g_bin | |
380 if l[9] != "inf": | |
381 eqtl_outfile.write(eqtlline.strip()+"\t"+classification+"\t"+eqtl_bin+"\t"+str(g_bin)+"\t"+str(overlap_score)+"\t"+str(full_partial)+"\n") | |
382 else: | |
383 #eqtl_outfile.write(eqtlline.strip()+"\t"+classification+"\t"+eqtl_bin+"\t"+str(g_bin)+"\n") | |
384 temp_line = "\t".join(l[0:9])+"\t"+str(max_peakLR)+"\t"+"\t".join(l[10:]) | |
385 eqtl_outfile.write(temp_line+"\t"+classification+"\t"+eqtl_bin+"\t"+str(g_bin)+"\t"+str(overlap_score)+"\t"+str(full_partial)+"\n") | |
386 #print(eqtlline.strip()+"\t"+classification+"\t"+eqtl_bin+"\t"+str(g_bin)) | |
387 eqtl.close() | |
388 eqtl_outfile.close() | |
389 | |
390 # write cis and trans classification files | |
391 cis_chr = [] | |
392 trans_chr = [] | |
393 all_chr = [] | |
394 unknown_chr = [] | |
395 all_outfile = open(options.output1,'r') # all classify.txt | |
396 cis_outfile = open(options.output2,'w') # cis classify.txt | |
397 cis_outfile.write("gene\tindex\tchr\tstart_marker\tstart_int\tend_marker\tend_int\tpeak_marker\tpeak_int\tpeakLR\trsq\trtsq\tparent_up_reg\tclassification\teQTL_bin\tgene_bin\toverlap_score\tstatus\n") | |
398 trans_outfile = open(options.output3,'w') # trans classify.txt | |
399 trans_outfile.write("gene\tindex\tchr\tstart_marker\tstart_int\tend_marker\tend_int\tpeak_marker\tpeak_int\tpeakLR\trsq\trtsq\tparent_up_reg\tclassification\teQTL_bin\tgene_bin\toverlap_score\tstatus\n") | |
400 for allline in all_outfile: | |
401 all_chr.append(allline.strip().split("\t")[2]) | |
402 if allline.strip().split("\t")[13] == "cis": | |
403 cis_outfile.write(allline.strip()+"\n") | |
404 cis_chr.append(allline.strip().split("\t")[2]) | |
405 if allline.strip().split("\t")[13] == "trans": | |
406 trans_outfile.write(allline.strip()+"\n") | |
407 trans_chr.append(allline.strip().split("\t")[2]) | |
408 if allline.strip().split("\t")[13] == "no_result": | |
409 unknown_chr.append(allline.strip().split("\t")[2]) | |
410 cis_outfile.close() | |
411 trans_outfile.close() | |
412 all_outfile.close() | |
413 | |
414 # calcualte summary statistics | |
415 p_cis = round((cis/float(total))*100,2) | |
416 p_trans = round((trans/float(total))*100,2) | |
417 p_no_result = round((no_result/float(total))*100,2) | |
418 p_total = round((total/float(total))*100,2) | |
419 | |
420 mean_cis_peakLR = round(cis_peakLR/float(cis),1) | |
421 mean_cis_rsq = round(cis_rsq/float(cis),2) | |
422 mean_cis_rtsq = round(cis_rtsq/float(cis),2) | |
423 | |
424 mean_trans_peakLR = round(trans_peakLR/float(trans),1) | |
425 mean_trans_rsq = round(trans_rsq/float(trans),2) | |
426 mean_trans_rtsq = round(trans_rtsq/float(trans),2) | |
427 | |
428 if no_result != 0: | |
429 mean_nr_peakLR = round(nr_peakLR/float(no_result),1) | |
430 mean_nr_rsq = round(nr_rsq/float(no_result),2) | |
431 mean_nr_rtsq = round(nr_rtsq/float(no_result),2) | |
432 else: | |
433 mean_nr_peakLR = "-" | |
434 mean_nr_rsq = "-" | |
435 mean_nr_rtsq = "-" | |
436 | |
437 mean_tot_peakLR = round(tot_peakLR/float(total),1) | |
438 mean_tot_rsq = round(tot_rsq/float(total),2) | |
439 mean_tot_rtsq = round(tot_rtsq/float(total),2) | |
440 | |
441 cis_summary = "cis\t"+str(cis)+"\t"+ str(p_cis)+"%\t"+str(mean_cis_peakLR)+"\t"+str(mean_cis_rsq)+"\t"+str(mean_cis_rtsq)+"\n" | |
442 trans_summary = "trans\t"+str(trans)+"\t"+ str(p_trans)+"%\t"+str(mean_trans_peakLR)+"\t"+str(mean_trans_rsq)+"\t"+str(mean_trans_rtsq)+"\n" | |
443 no_result_summary = "unknown\t"+str(no_result)+"\t"+ str(p_no_result)+"%\t"+str(mean_nr_peakLR)+"\t"+str(mean_nr_rsq)+"\t"+str(mean_nr_rtsq)+"\n" | |
444 total_summary = "all\t"+str(total)+"\t"+ str(p_total)+"%\t"+str(mean_tot_peakLR)+"\t"+str(mean_tot_rsq)+"\t"+str(mean_tot_rtsq)+"\n" | |
445 | |
446 summary_outfile = open(options.output4,'w') # classify_summary.txt | |
447 summary_outfile.write("class\tnumber_eQTLs\tpercentage_eQTLs\taverage_peakLR\taverage_rsq\taverage_rtsq\n") | |
448 summary_outfile.write(cis_summary+trans_summary+no_result_summary+total_summary) | |
449 summary_outfile.close() | |
450 | |
451 | |
452 genes_chr = [] | |
453 for g in gene_dict.keys(): | |
454 genes_chr.append(gene_dict[g][0]) | |
455 | |
456 count_genes = {} | |
457 for i in set(genes_chr): | |
458 count_genes[i] = genes_chr.count(i) | |
459 count_cis = {} | |
460 for i in set(cis_chr): | |
461 count_cis[i] = cis_chr.count(i) | |
462 count_trans = {} | |
463 for i in set(trans_chr): | |
464 count_trans[i] = trans_chr.count(i) | |
465 count_all = {} | |
466 for i in set(all_chr): | |
467 count_all[i] = all_chr.count(i) | |
468 count_unknown = {} | |
469 for i in set(unknown_chr): | |
470 count_unknown[i] = unknown_chr.count(i) | |
471 | |
472 i = 0 | |
473 tot_genes = 0 | |
474 tot_cis = 0 | |
475 tot_trans = 0 | |
476 tot_unknown = 0 | |
477 tot_all = 0 | |
478 chr_summary_v2 = open(options.output5,'w') | |
479 chr_summary = open(options.input2,'r') | |
480 for line in chr_summary: | |
481 i += 1 | |
482 if i == 1: | |
483 chr_summary_v2.write(line.strip()+"\tgenes\tcis eQTL\ttrans eQT\tunknown eQTL\tall eQTL\n") | |
484 else: | |
485 l = line.strip().split("\t") | |
486 if l[0] != "Total": | |
487 try: | |
488 count_unknown_ans = count_unknown[l[0]] | |
489 tot_unknown += count_unknown[l[0]] | |
490 except: | |
491 count_unknown_ans = 0 | |
492 tot_unknown = 0 | |
493 chr_summary_v2.write(line.strip()+"\t"+str(count_genes[l[0]])+"\t"+str(count_cis[l[0]])+"\t"+str(count_trans[l[0]])+"\t"+str(count_unknown_ans)+"\t"+str(count_all[l[0]])+"\n") | |
494 tot_genes += count_genes[l[0]] | |
495 tot_cis += count_cis[l[0]] | |
496 tot_trans += count_trans[l[0]] | |
497 tot_all += count_all[l[0]] | |
498 else: | |
499 chr_summary_v2.write(line.strip()+"\t"+str(tot_genes)+"\t"+str(tot_cis)+"\t"+str(tot_trans)+"\t"+str(tot_unknown)+"\t"+str(tot_all)+"\n") | |
500 chr_summary_v2.close() | |
501 chr_summary.close() | |
502 | |
503 gene_pos_v2 = open(options.output6,'w') | |
504 gene_pos_v2.write("gene\tchr\tstart_bp\tend_bp\tnum_eQTL\tnum_cis_eQTL\tnum_trans_eQTL\tnum_unknown_eQTL\tgene_bin\n") | |
505 | |
506 eqtl_count = 0 | |
507 genes_with_eqtl = 0 | |
508 cis_count = 0 | |
509 genes_with_cis = 0 | |
510 trans_count = 0 | |
511 genes_with_trans = 0 | |
512 only_cis = 0 | |
513 only_trans = 0 | |
514 cis_and_trans = 0 | |
515 cis_or_trans = 0 | |
516 for g in gene_dict.keys(): | |
517 gene_pos_v2.write(g+"\t"+str(gene_dict[g][0])+"\t"+str(gene_dict[g][1])+"\t"+str(gene_dict[g][2])+"\t"+str(gene_dict[g][3])+"\t"+str(gene_dict[g][4])+"\t"+str(gene_dict[g][5])+"\t"+str(gene_dict[g][6])+"\t"+str(gene_bin_dict[g])+"\n") | |
518 # num all eQTL | |
519 if int(gene_dict[g][3]) != 0: | |
520 genes_with_eqtl += 1 | |
521 eqtl_count += int(gene_dict[g][3]) | |
522 # num cis eQTL | |
523 if int(gene_dict[g][4]) != 0: | |
524 genes_with_cis += 1 | |
525 cis_count += int(gene_dict[g][4]) | |
526 # num trans eQTL | |
527 if int(gene_dict[g][5]) != 0: | |
528 genes_with_trans += 1 | |
529 trans_count += int(gene_dict[g][5]) | |
530 if (int(gene_dict[g][4]) != 0) and (int(gene_dict[g][5]) == 0): | |
531 only_cis += 1 | |
532 if (int(gene_dict[g][5]) != 0) and (int(gene_dict[g][4]) == 0): | |
533 only_trans += 1 | |
534 if (int(gene_dict[g][5]) != 0) and (int(gene_dict[g][4]) != 0): | |
535 cis_and_trans += 1 | |
536 if (int(gene_dict[g][5]) != 0) or (int(gene_dict[g][4]) != 0): | |
537 cis_or_trans += 1 | |
538 | |
539 ave_num_eqtl_per_gene = eqtl_count/float(genes_with_eqtl) | |
540 ave_num_cis_eqtl_per_gene = cis_count/float(genes_with_cis) | |
541 ave_num_trans_eqtl_per_gene = trans_count/float(genes_with_trans) | |
542 #print "Average number of eQTLs per gene with eQTL\t"+str(round(ave_num_eqtl_per_gene,1)) | |
543 #print "Average number of cis eQTLs per gene with cis eQTL\t"+str(round(ave_num_cis_eqtl_per_gene,1)) | |
544 #print "Average number of trans eQTLs per gene with trans eQTL\t"+str(round(ave_num_trans_eqtl_per_gene,1)) | |
545 #print "Number of genes with only cis eQTL (no trans)\t"+str(only_cis) | |
546 #print "Number of genes with only trans eQTL (no cis)\t"+str(only_trans) | |
547 #print "Number of genes with cis and trans eQTL\t"+str(cis_and_trans) | |
548 #print "Number of genes with cis or trans eQTL\t"+str(cis_or_trans) | |
549 | |
550 eqtl_per_gene = open(options.output7,'w') | |
551 eqtl_per_gene.write("Average number of eQTLs per gene with eQTL\t"+str(round(ave_num_eqtl_per_gene,1))+"\n") | |
552 eqtl_per_gene.write("Average number of cis eQTLs per gene with cis eQTL\t"+str(round(ave_num_cis_eqtl_per_gene,1))+"\n") | |
553 eqtl_per_gene.write("Average number of trans eQTLs per gene with trans eQTL\t"+str(round(ave_num_trans_eqtl_per_gene,1))+"\n") | |
554 eqtl_per_gene.write("Number of genes with only cis eQTL (no trans)\t"+str(only_cis)+" ("+str(round((int(only_cis)/float(cis_or_trans))*100,1))+"%)\n") | |
555 eqtl_per_gene.write("Number of genes with only trans eQTL (no cis)\t"+str(only_trans)+" ("+str(round((int(only_trans)/float(cis_or_trans))*100,1))+"%)\n") | |
556 eqtl_per_gene.write("Number of genes with cis and trans eQTL\t"+str(cis_and_trans)+" ("+str(round((int(cis_and_trans)/float(cis_or_trans))*100,1))+"%)\n") | |
557 eqtl_per_gene.write("Number of genes with cis or trans eQTL\t"+str(cis_or_trans)+" ("+str(round((int(cis_and_trans)/float(cis_and_trans))*100,1))+"%)\n") | |
558 eqtl_per_gene.close() | |
559 | |
560 gene_pos_v2.close() | |
561 | |
562 # Create temp direcotry | |
563 tempdir = tempfile.mkdtemp() | |
564 | |
565 # copy INPUT file to the temp directory | |
566 s = "cp %s %s/all_classification.txt" %(options.output1, tempdir) | |
567 subprocess.call(s, shell=True) | |
568 s = "cp %s %s/lookup.txt" %(options.input4, tempdir) | |
569 subprocess.call(s, shell=True) | |
570 | |
571 # create R script => save in temp directory | |
572 # generate new header | |
573 new_script = open(tempdir+"/new_script.r","w") | |
574 header = "setwd(\"%s\")" %tempdir | |
575 new_script.write(header+"\n") | |
576 # add script body | |
577 script = open(options.rscript,"r") | |
578 for line in script: | |
579 new_script.write(line.strip()+"\n") | |
580 new_script.close() | |
581 | |
582 # run R script from temp directory | |
583 s = "R CMD BATCH %s/new_script.r out.txt" %tempdir | |
584 subprocess.call(s, shell=True) | |
585 | |
586 # run R script from temp directory | |
587 s = "R CMD BATCH %s/new_script.r out.txt" %tempdir | |
588 subprocess.call(s, shell=True) | |
589 | |
590 os.system("mv %s/Rplot_eQTL_genes_positions.pdf %s" %(tempdir,options.output8)) | |
591 | |
592 if __name__=="__main__": | |
593 __main__() | |
594 | |
595 | |
596 | |
597 |