comparison classifier/classifier.py @ 0:ef9c2044d86a draft

Uploaded
author nanettec
date Fri, 18 Mar 2016 05:15:29 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:ef9c2044d86a
1 """
2 @summary: Classify eQTLs as cis or trans - calculate "hybrid" overlap score
3 @author: nanette.coetzer@gmail.com
4 @version 5
5
6 """
7 import optparse, sys
8 import tempfile
9 import os, re
10 import subprocess
11
12 def stop_err( msg ):
13 sys.stderr.write( "%s\n" % msg )
14 sys.exit()
15
16 def __main__():
17 #Parse Command Line
18 parser = optparse.OptionParser()
19 parser.add_option("-m", "--rscript", default=None, dest="rscript",
20 help="R script")
21 parser.add_option("-i", "--input1", default=None, dest="input1",
22 help="eQTLs file")
23 parser.add_option("-j", "--input2", default=None, dest="input2",
24 help="Chr summary file")
25 parser.add_option("-k", "--input3", default=None, dest="input3",
26 help="Gene positions file")
27 parser.add_option("-l", "--input4", default=None, dest="input4",
28 help="Lookup file")
29
30 #parser.add_option("-m", "--score_type", default=2, dest="score_type",
31 # help="Select dM scale (1) or hybrid (2)")
32
33 parser.add_option("-o", "--output1", default=None, dest="output1",
34 help="All classification file")
35 parser.add_option("-p", "--output2", default=None, dest="output2",
36 help="Cis classification file")
37 parser.add_option("-q", "--output3", default=None, dest="output3",
38 help="Trans classification file")
39 parser.add_option("-r", "--output4", default=None, dest="output4",
40 help="Classification summary file")
41 parser.add_option("-s", "--output5", default=None, dest="output5",
42 help="Chr summary file v2")
43 parser.add_option("-t", "--output6", default=None, dest="output6",
44 help="Gene positions file v2")
45 parser.add_option("-u", "--output7", default=None, dest="output7",
46 help="eQTLs per gene summary file")
47 parser.add_option("-v", "--output8", default=None, dest="output8",
48 help="eQTL vs gene position plot")
49 (options, args) = parser.parse_args()
50
51 try:
52 open(options.input1, "r").close()
53 except TypeError, e:
54 stop_err("You need to supply the eQTL file:\n" + str(e))
55 except IOError, e:
56 stop_err("Can not open the eQTL file:\n" + str(e))
57
58 try:
59 open(options.input2, "r").close()
60 except TypeError, e:
61 stop_err("You need to supply the Chr summary file:\n" + str(e))
62 except IOError, e:
63 stop_err("Can not open the Chr summary file:\n" + str(e))
64
65 try:
66 open(options.input3, "r").close()
67 except TypeError, e:
68 stop_err("You need to supply the Gene positions file:\n" + str(e))
69 except IOError, e:
70 stop_err("Can not open the Gene positions file:\n" + str(e))
71
72 try:
73 open(options.input4, "r").close()
74 except TypeError, e:
75 stop_err("You need to supply the Lookup file:\n" + str(e))
76 except IOError, e:
77 stop_err("Can not open the Lookup file:\n" + str(e))
78
79 ##############################################
80 lookup_list = []
81
82 # Convert eQTL input file, change marker number of last interval in each chromosome
83 check2 = ""
84 prev_m = "0"
85 prev_c = "1"
86 r = re.compile("^(\d+)\s+(\d+)\s+(\d+.\d+)\s+")
87 indict = {}
88 max_cM_dict = {}
89 lookup = open(options.input4,'r')
90 for line in lookup:
91 l = line.strip().split("\t")
92 if l[6] == "0":
93 #print l
94 max_cM_dict[l[1]] = l[4]
95 lookup_list.append(l)
96 if l[0] != "id" and l[6] != "0":
97 c = l[1]
98 if prev_c != c:
99 indict[prev_c+"\t"+str(float(p))] = prev_c+"\t"+m+"\t"+str(float(p))
100 m = l[2]
101 p = l[3]
102 prev_c = c
103 #print max_cM_dict
104
105 #add last line of last chromosome
106 indict[prev_c+"\t"+p] = prev_c+"\t"+m+"\t"+p
107
108 #print indict
109
110 temp_file = tempfile.mktemp()
111 eqtl_temp_output = open(temp_file,'w')
112
113 eQTL_size = 0
114 eQTL_nr = 0
115 eqtl_in = open(options.input1,'r')
116 for line in eqtl_in:
117 l = line.strip().split("\t")
118 if l[3].startswith("c"):
119 header = 1 # yes
120 else:
121 header = 0 # no
122 if header != 1:
123 sfull = l[3]+"\t"+l[13]+"\t"+str(float(l[14]))
124 efull = l[3]+"\t"+l[15]+"\t"+str(float(l[16]))
125 pfull = l[3]+"\t"+l[4]+"\t"+str(float(l[5]))
126 sshort = l[3]+"\t"+str(float(l[14]))
127 eshort = l[3]+"\t"+str(float(l[16]))
128 pshort = l[3]+"\t"+str(float(l[5]))
129
130 if sshort in indict.keys():
131 sfull = indict[sshort]
132 if eshort in indict.keys():
133 efull = indict[eshort]
134 if pshort in indict.keys():
135 pfull = indict[pshort]
136 s = sfull.split("\t")
137 e = efull.split("\t")
138 p = pfull.split("\t")
139 eqtl_temp_output.write(l[0]+"\t"+l[1]+"\t"+s[0]+"\t"+s[1]+"\t"+s[2]+"\t"+e[1]+"\t"+e[2]+"\t"+p[1]+"\t"+p[2]+"\t"+l[6]+"\t"+l[8]+"\t"+l[9]+"\t"+l[11]+"\n")
140 #print l[0]+"\t"+l[1]+"\t"+s[0]+"\t"+s[1]+"\t"+s[2]+"\t"+e[1]+"\t"+e[2]+"\t"+p[1]+"\t"+p[2]+"\t"+l[6]+"\t"+l[8]+"\t"+l[9]+"\t"+l[11]+"\n"
141
142 eQTL_size += (float(l[16])*100)-(float(l[14])*100)
143 eQTL_nr += 1
144 lookup.close()
145 eqtl_in.close()
146 eqtl_temp_output.close()
147
148 cutoff = (eQTL_size/float(eQTL_nr))/2
149 #print "cutoff: ",cutoff
150 #############################################
151
152 cis = 0
153 trans = 0
154 no_result = 0
155 total = 0
156 cis_peakLR = 0
157 cis_rsq = 0
158 cis_rtsq = 0
159 trans_peakLR = 0
160 trans_rsq = 0
161 trans_rtsq = 0
162 nr_peakLR = 0
163 nr_rsq = 0
164 nr_rtsq = 0
165 tot_peakLR = 0
166 tot_rsq = 0
167 tot_rtsq = 0
168
169 # Create 2 dictionaries with gene info: gene_dict and gene_bin_dict
170 gene_positions = open(options.input3,'r') # gene_pos.txt
171 gene_dict = {}
172 gene_bin_dict = {}
173 for line in gene_positions:
174 l = line.strip().split("\t")
175 if l[0].startswith("gene") or l[0].startswith("Gene") or l[0].startswith("name") or l[0].startswith("Name"):
176 header = 1 # yes
177 else:
178 header = 0 # no
179 if header != 1:
180 # create a gene dictionary form gene positions file
181 gene_dict[l[0]] = l[1:]+[0,0,0,0]
182 chr_gene = l[1]
183 bp_gene = round((int(l[2])+int(l[3]))/2.0,0)
184
185 # for each gene, get the correct bin from the lookup file
186 # first read lookup file into a list of lists (lookup_list), faster
187 gene_bin = "NA"
188 #print lookup_list
189 for lu in lookup_list:
190 if (lu[1]==chr_gene) and (bp_gene>=int(float(lu[5]))) and (float(lu[6])!=0):
191 gene_bin = lu[0]
192 gene_bin_dict[l[0]]=gene_bin
193
194 gene_positions.close()
195 #print "Done 2"
196 eqtl_outfile = open(options.output1,'w') # classify.txt
197 eqtl_outfile.write("gene\tindex\tchr\tstart_marker\tstart_int\tend_marker\tend_int\tpeak_marker\tpeak_int\tpeakLR\trsq\trtsq\tadditive\tclassification\teQTL_bin\tgene_bin\toverlap_score\tstatus\n")
198
199 max_peakLR = 0.0
200 eqtl_get_maxLR = open(temp_file,'r')
201 for eqtlline in eqtl_get_maxLR:
202 l = eqtlline.strip().split("\t")
203 if l[1].startswith("i"):
204 header = 1 # yes
205 else:
206 header = 0 # no
207 if header != 1:
208 if ((l[9] != 'inf') and (float(l[9]) > max_peakLR)):
209 max_peakLR = float(l[9])
210 eqtl_get_maxLR.close()
211
212 count = 0
213 position = 0
214 status = 0
215 x2 = ""
216
217 eqtl = open(temp_file,'r') # eQTLs.txt
218 for eqtlline in eqtl:
219 classification = ""
220 l = eqtlline.strip().split("\t")
221 if l[1].startswith("i"):
222 header = 1 # yes
223 else:
224 header = 0 # no
225 if header != 1:
226 gene = l[0]
227 # for each eqtl, get the correct bin from the lookup file (lookup_list)
228 eqtl_bin = "NA"
229 for lu in lookup_list:
230 if (lu[1]==l[2]) and (float(l[8])>=float(lu[3])) and (float(lu[6])!=0):
231 eqtl_bin = lu[0]
232 try:
233 g = gene_dict[gene]
234 position = 1
235 except:
236 # if the position of the gene is not known, then eQTL cannot be classified
237 classification = "no_result"
238 overlap_score = 0
239 position = 0
240 full_partial = "."
241 if position == 1:
242 gene_chr = g[0]
243 eqtl_chr = l[2]
244 # if the gene and eQTL on different chromosomes, then eQTL classified as trans
245 if gene_chr != eqtl_chr:
246 classification = "trans"
247 overlap_score = 0
248 full_partial = "."
249 # if the gene and eQTL on the same chromosome, then test if cis / trans
250 else:
251 #print "else"
252 gene_bp = int(round((int(g[1])+int(g[2]))/float(2),0))
253 prev_bp = 0
254 prev_line = [0,0,0,0.0,0.0,0]
255 lookup = open(options.input4,'r') # lookup.txt
256 status = 0
257 for line in lookup:
258 look = line.strip().split("\t")
259 if look[1] == gene_chr:
260 # for the gene position in bp (v1), calculate the best estimate cM position (x2)
261 # if gene is NOT to the left of the 1st marker (< 0 cM) or to the right of the last marker, x2 will be calculated and status will be 1
262
263 if (int(float(gene_bp)) > int(float(prev_bp))) and (int(float(gene_bp)) <= int(float(look[5]))):
264 # 2 : cM and 1 : bp
265 #print "gene_bp = "+str(gene_bp)
266 #print "prev_bp = "+str(prev_bp)
267 #print "look[5] = "+str(look[5])
268 v1 = int(gene_bp)
269 s1 = int(prev_bp)
270 l1 = int(float(look[5]))
271 s2 = float(prev_line[4])
272 l2 = float(look[4])
273 x2 = round(((v1-s1)*(l2-s2)/(l1-s1))+s2,2)
274 status = 1
275 #print "x2 = "+str(x2)
276 prev_bp = look[5]
277 prev_line = look
278 lookup.close()
279 #print "status = "+str(status)+"\n"
280 if status == 0:
281 #print "*********" * 100
282 #print "gene at end of chr\n\n\n"
283 x2 = round(float(max_cM_dict[gene_chr]),2)
284 #print x2
285 g_bin = "NA"
286 # if the gene is to the left of the 1st marker (< 0 cM) or to the right of the last marker; this could be improved (for the maize data it is fine)
287 #classification = "no_result"
288 #print classification
289 if classification != "if status == 0:":
290 # get the eQTL start and end cM positions (from lookup table)
291 #lookup = open(options.input4,'r') # lookup.txt
292 #count = 0
293 #for line in lookup:
294 # count += 1
295 # if count > 1:
296 # look = line.strip().split("\t")
297 # if int(look[1]) == int(eqtl_chr) and float(look[3]) == float(l[4]):
298 # eqtl_start = float(look[4])
299 # if int(look[1]) == int(eqtl_chr) and float(look[3]) == float(l[6]):
300 # eqtl_end = float(look[4])
301 #lookup.close()
302 # compare calculated gene cM position (x2) with eQTL start and end cM positions
303 # calculate a region around the gene (5 cM to either side) and test whether (i) eQTL start or (ii) eQTL end overlaps with that region; or (iii) whether the eQTL is entirely in that region
304 eqtl_start = float(l[4])*100
305 eqtl_end = float(l[6])*100
306 eqtl_peak = float(l[8])*100
307 #cutoff = 5
308
309 if (int(gene_chr) == int(eqtl_chr)) and ((eqtl_start >= x2-float(cutoff) and eqtl_start <= x2+float(cutoff)) or (eqtl_end >= x2-float(cutoff) and eqtl_end <= x2+float(cutoff)) or (eqtl_start <= x2-float(cutoff) and eqtl_end >= x2+float(cutoff))):
310 classification = "cis"
311
312 #if options.score_type == str(1):
313 # # calculate overlap score (dM score)
314 # start_end_list = [float(eqtl_start/10), float(eqtl_end/10), (x2-float(cutoff))/10, (x2+float(cutoff))/10]
315 # start_end_list.sort()
316 #
317 # ratio = ((start_end_list[2]-start_end_list[1])+1)/float((start_end_list[3]-start_end_list[0])+1)
318 # diff_peaks = float(abs((x2/10)-float(eqtl_peak/10)))+1
319 # overlap_score = float(ratio/diff_peaks)
320 #else:
321 # # calculate overlap score (hybrid score: full overlap (both peaks in overlapping region) = M scale; partial overlap = dM scale)
322 # # Example of scale: 0.0801 M = 0.801 dM = 8.01 cM
323
324 start_end_list = [float(eqtl_start/10),float(eqtl_end/10),(x2-float(cutoff))/10, (x2+float(cutoff))/10]
325 start_end_list.sort()
326 ######### test for full / partial overlap #########
327 if ((float(eqtl_peak/10) >= start_end_list[1]) and (float(eqtl_peak/10) <= start_end_list[2])) \
328 and ((x2/10 >= start_end_list[1]) and (x2/10 <= start_end_list[2])):
329 ratio = ((start_end_list[2]/10-start_end_list[1]/10)+1)/float((start_end_list[3]/10-start_end_list[0]/10)+1)
330 diff_peaks = float(abs(x2/100-float(eqtl_peak/100)))+1
331 overlap_score = float(ratio/diff_peaks)
332 full_partial = "full"
333
334 else:
335 # partial overlap --> use dM scale (multiply position by 10) --> already done
336 ratio = ((start_end_list[2]-start_end_list[1])+1)/float((start_end_list[3]-start_end_list[0])+1)
337 diff_peaks = float(abs(x2/10-float(eqtl_peak/10)))+1
338 overlap_score = float(ratio/diff_peaks)
339 full_partial = "partial"
340
341 else:
342 classification = "trans"
343 overlap_score = 0
344 full_partial = "."
345 if classification == "cis":
346 cis += 1
347 cis_peakLR += float(l[9])
348 cis_rsq += float(l[10])
349 cis_rtsq += float(l[11])
350 g[4] += 1
351 if classification == "trans":
352 trans += 1
353 trans_peakLR += float(l[9])
354 trans_rsq += float(l[10])
355 trans_rtsq += float(l[11])
356 g[5] += 1
357 if classification == "no_result":
358 no_result += 1
359 nr_peakLR += float(l[9])
360 nr_rsq += float(l[10])
361 nr_rtsq += float(l[11])
362 g[6] += 1
363 g[3] += 1
364 total += 1
365 tot_peakLR += float(l[9])
366 tot_rsq += float(l[10])
367 tot_rtsq += float(l[11])
368 try:
369 g_bin = gene_bin_dict[l[0]]
370 except:
371 g_bin = "NA"
372 if status == 0:
373 #g_bin = "end c"+str(gene_chr)
374 g_bin = "NA"
375 gene_bin_dict[l[0]] = g_bin
376 if x2 == 0.0:
377 #g_bin = "start c"+str(gene_chr)
378 g_bin = "NA"
379 gene_bin_dict[l[0]] = g_bin
380 if l[9] != "inf":
381 eqtl_outfile.write(eqtlline.strip()+"\t"+classification+"\t"+eqtl_bin+"\t"+str(g_bin)+"\t"+str(overlap_score)+"\t"+str(full_partial)+"\n")
382 else:
383 #eqtl_outfile.write(eqtlline.strip()+"\t"+classification+"\t"+eqtl_bin+"\t"+str(g_bin)+"\n")
384 temp_line = "\t".join(l[0:9])+"\t"+str(max_peakLR)+"\t"+"\t".join(l[10:])
385 eqtl_outfile.write(temp_line+"\t"+classification+"\t"+eqtl_bin+"\t"+str(g_bin)+"\t"+str(overlap_score)+"\t"+str(full_partial)+"\n")
386 #print(eqtlline.strip()+"\t"+classification+"\t"+eqtl_bin+"\t"+str(g_bin))
387 eqtl.close()
388 eqtl_outfile.close()
389
390 # write cis and trans classification files
391 cis_chr = []
392 trans_chr = []
393 all_chr = []
394 unknown_chr = []
395 all_outfile = open(options.output1,'r') # all classify.txt
396 cis_outfile = open(options.output2,'w') # cis classify.txt
397 cis_outfile.write("gene\tindex\tchr\tstart_marker\tstart_int\tend_marker\tend_int\tpeak_marker\tpeak_int\tpeakLR\trsq\trtsq\tparent_up_reg\tclassification\teQTL_bin\tgene_bin\toverlap_score\tstatus\n")
398 trans_outfile = open(options.output3,'w') # trans classify.txt
399 trans_outfile.write("gene\tindex\tchr\tstart_marker\tstart_int\tend_marker\tend_int\tpeak_marker\tpeak_int\tpeakLR\trsq\trtsq\tparent_up_reg\tclassification\teQTL_bin\tgene_bin\toverlap_score\tstatus\n")
400 for allline in all_outfile:
401 all_chr.append(allline.strip().split("\t")[2])
402 if allline.strip().split("\t")[13] == "cis":
403 cis_outfile.write(allline.strip()+"\n")
404 cis_chr.append(allline.strip().split("\t")[2])
405 if allline.strip().split("\t")[13] == "trans":
406 trans_outfile.write(allline.strip()+"\n")
407 trans_chr.append(allline.strip().split("\t")[2])
408 if allline.strip().split("\t")[13] == "no_result":
409 unknown_chr.append(allline.strip().split("\t")[2])
410 cis_outfile.close()
411 trans_outfile.close()
412 all_outfile.close()
413
414 # calcualte summary statistics
415 p_cis = round((cis/float(total))*100,2)
416 p_trans = round((trans/float(total))*100,2)
417 p_no_result = round((no_result/float(total))*100,2)
418 p_total = round((total/float(total))*100,2)
419
420 mean_cis_peakLR = round(cis_peakLR/float(cis),1)
421 mean_cis_rsq = round(cis_rsq/float(cis),2)
422 mean_cis_rtsq = round(cis_rtsq/float(cis),2)
423
424 mean_trans_peakLR = round(trans_peakLR/float(trans),1)
425 mean_trans_rsq = round(trans_rsq/float(trans),2)
426 mean_trans_rtsq = round(trans_rtsq/float(trans),2)
427
428 if no_result != 0:
429 mean_nr_peakLR = round(nr_peakLR/float(no_result),1)
430 mean_nr_rsq = round(nr_rsq/float(no_result),2)
431 mean_nr_rtsq = round(nr_rtsq/float(no_result),2)
432 else:
433 mean_nr_peakLR = "-"
434 mean_nr_rsq = "-"
435 mean_nr_rtsq = "-"
436
437 mean_tot_peakLR = round(tot_peakLR/float(total),1)
438 mean_tot_rsq = round(tot_rsq/float(total),2)
439 mean_tot_rtsq = round(tot_rtsq/float(total),2)
440
441 cis_summary = "cis\t"+str(cis)+"\t"+ str(p_cis)+"%\t"+str(mean_cis_peakLR)+"\t"+str(mean_cis_rsq)+"\t"+str(mean_cis_rtsq)+"\n"
442 trans_summary = "trans\t"+str(trans)+"\t"+ str(p_trans)+"%\t"+str(mean_trans_peakLR)+"\t"+str(mean_trans_rsq)+"\t"+str(mean_trans_rtsq)+"\n"
443 no_result_summary = "unknown\t"+str(no_result)+"\t"+ str(p_no_result)+"%\t"+str(mean_nr_peakLR)+"\t"+str(mean_nr_rsq)+"\t"+str(mean_nr_rtsq)+"\n"
444 total_summary = "all\t"+str(total)+"\t"+ str(p_total)+"%\t"+str(mean_tot_peakLR)+"\t"+str(mean_tot_rsq)+"\t"+str(mean_tot_rtsq)+"\n"
445
446 summary_outfile = open(options.output4,'w') # classify_summary.txt
447 summary_outfile.write("class\tnumber_eQTLs\tpercentage_eQTLs\taverage_peakLR\taverage_rsq\taverage_rtsq\n")
448 summary_outfile.write(cis_summary+trans_summary+no_result_summary+total_summary)
449 summary_outfile.close()
450
451
452 genes_chr = []
453 for g in gene_dict.keys():
454 genes_chr.append(gene_dict[g][0])
455
456 count_genes = {}
457 for i in set(genes_chr):
458 count_genes[i] = genes_chr.count(i)
459 count_cis = {}
460 for i in set(cis_chr):
461 count_cis[i] = cis_chr.count(i)
462 count_trans = {}
463 for i in set(trans_chr):
464 count_trans[i] = trans_chr.count(i)
465 count_all = {}
466 for i in set(all_chr):
467 count_all[i] = all_chr.count(i)
468 count_unknown = {}
469 for i in set(unknown_chr):
470 count_unknown[i] = unknown_chr.count(i)
471
472 i = 0
473 tot_genes = 0
474 tot_cis = 0
475 tot_trans = 0
476 tot_unknown = 0
477 tot_all = 0
478 chr_summary_v2 = open(options.output5,'w')
479 chr_summary = open(options.input2,'r')
480 for line in chr_summary:
481 i += 1
482 if i == 1:
483 chr_summary_v2.write(line.strip()+"\tgenes\tcis eQTL\ttrans eQT\tunknown eQTL\tall eQTL\n")
484 else:
485 l = line.strip().split("\t")
486 if l[0] != "Total":
487 try:
488 count_unknown_ans = count_unknown[l[0]]
489 tot_unknown += count_unknown[l[0]]
490 except:
491 count_unknown_ans = 0
492 tot_unknown = 0
493 chr_summary_v2.write(line.strip()+"\t"+str(count_genes[l[0]])+"\t"+str(count_cis[l[0]])+"\t"+str(count_trans[l[0]])+"\t"+str(count_unknown_ans)+"\t"+str(count_all[l[0]])+"\n")
494 tot_genes += count_genes[l[0]]
495 tot_cis += count_cis[l[0]]
496 tot_trans += count_trans[l[0]]
497 tot_all += count_all[l[0]]
498 else:
499 chr_summary_v2.write(line.strip()+"\t"+str(tot_genes)+"\t"+str(tot_cis)+"\t"+str(tot_trans)+"\t"+str(tot_unknown)+"\t"+str(tot_all)+"\n")
500 chr_summary_v2.close()
501 chr_summary.close()
502
503 gene_pos_v2 = open(options.output6,'w')
504 gene_pos_v2.write("gene\tchr\tstart_bp\tend_bp\tnum_eQTL\tnum_cis_eQTL\tnum_trans_eQTL\tnum_unknown_eQTL\tgene_bin\n")
505
506 eqtl_count = 0
507 genes_with_eqtl = 0
508 cis_count = 0
509 genes_with_cis = 0
510 trans_count = 0
511 genes_with_trans = 0
512 only_cis = 0
513 only_trans = 0
514 cis_and_trans = 0
515 cis_or_trans = 0
516 for g in gene_dict.keys():
517 gene_pos_v2.write(g+"\t"+str(gene_dict[g][0])+"\t"+str(gene_dict[g][1])+"\t"+str(gene_dict[g][2])+"\t"+str(gene_dict[g][3])+"\t"+str(gene_dict[g][4])+"\t"+str(gene_dict[g][5])+"\t"+str(gene_dict[g][6])+"\t"+str(gene_bin_dict[g])+"\n")
518 # num all eQTL
519 if int(gene_dict[g][3]) != 0:
520 genes_with_eqtl += 1
521 eqtl_count += int(gene_dict[g][3])
522 # num cis eQTL
523 if int(gene_dict[g][4]) != 0:
524 genes_with_cis += 1
525 cis_count += int(gene_dict[g][4])
526 # num trans eQTL
527 if int(gene_dict[g][5]) != 0:
528 genes_with_trans += 1
529 trans_count += int(gene_dict[g][5])
530 if (int(gene_dict[g][4]) != 0) and (int(gene_dict[g][5]) == 0):
531 only_cis += 1
532 if (int(gene_dict[g][5]) != 0) and (int(gene_dict[g][4]) == 0):
533 only_trans += 1
534 if (int(gene_dict[g][5]) != 0) and (int(gene_dict[g][4]) != 0):
535 cis_and_trans += 1
536 if (int(gene_dict[g][5]) != 0) or (int(gene_dict[g][4]) != 0):
537 cis_or_trans += 1
538
539 ave_num_eqtl_per_gene = eqtl_count/float(genes_with_eqtl)
540 ave_num_cis_eqtl_per_gene = cis_count/float(genes_with_cis)
541 ave_num_trans_eqtl_per_gene = trans_count/float(genes_with_trans)
542 #print "Average number of eQTLs per gene with eQTL\t"+str(round(ave_num_eqtl_per_gene,1))
543 #print "Average number of cis eQTLs per gene with cis eQTL\t"+str(round(ave_num_cis_eqtl_per_gene,1))
544 #print "Average number of trans eQTLs per gene with trans eQTL\t"+str(round(ave_num_trans_eqtl_per_gene,1))
545 #print "Number of genes with only cis eQTL (no trans)\t"+str(only_cis)
546 #print "Number of genes with only trans eQTL (no cis)\t"+str(only_trans)
547 #print "Number of genes with cis and trans eQTL\t"+str(cis_and_trans)
548 #print "Number of genes with cis or trans eQTL\t"+str(cis_or_trans)
549
550 eqtl_per_gene = open(options.output7,'w')
551 eqtl_per_gene.write("Average number of eQTLs per gene with eQTL\t"+str(round(ave_num_eqtl_per_gene,1))+"\n")
552 eqtl_per_gene.write("Average number of cis eQTLs per gene with cis eQTL\t"+str(round(ave_num_cis_eqtl_per_gene,1))+"\n")
553 eqtl_per_gene.write("Average number of trans eQTLs per gene with trans eQTL\t"+str(round(ave_num_trans_eqtl_per_gene,1))+"\n")
554 eqtl_per_gene.write("Number of genes with only cis eQTL (no trans)\t"+str(only_cis)+" ("+str(round((int(only_cis)/float(cis_or_trans))*100,1))+"%)\n")
555 eqtl_per_gene.write("Number of genes with only trans eQTL (no cis)\t"+str(only_trans)+" ("+str(round((int(only_trans)/float(cis_or_trans))*100,1))+"%)\n")
556 eqtl_per_gene.write("Number of genes with cis and trans eQTL\t"+str(cis_and_trans)+" ("+str(round((int(cis_and_trans)/float(cis_or_trans))*100,1))+"%)\n")
557 eqtl_per_gene.write("Number of genes with cis or trans eQTL\t"+str(cis_or_trans)+" ("+str(round((int(cis_and_trans)/float(cis_and_trans))*100,1))+"%)\n")
558 eqtl_per_gene.close()
559
560 gene_pos_v2.close()
561
562 # Create temp direcotry
563 tempdir = tempfile.mkdtemp()
564
565 # copy INPUT file to the temp directory
566 s = "cp %s %s/all_classification.txt" %(options.output1, tempdir)
567 subprocess.call(s, shell=True)
568 s = "cp %s %s/lookup.txt" %(options.input4, tempdir)
569 subprocess.call(s, shell=True)
570
571 # create R script => save in temp directory
572 # generate new header
573 new_script = open(tempdir+"/new_script.r","w")
574 header = "setwd(\"%s\")" %tempdir
575 new_script.write(header+"\n")
576 # add script body
577 script = open(options.rscript,"r")
578 for line in script:
579 new_script.write(line.strip()+"\n")
580 new_script.close()
581
582 # run R script from temp directory
583 s = "R CMD BATCH %s/new_script.r out.txt" %tempdir
584 subprocess.call(s, shell=True)
585
586 # run R script from temp directory
587 s = "R CMD BATCH %s/new_script.r out.txt" %tempdir
588 subprocess.call(s, shell=True)
589
590 os.system("mv %s/Rplot_eQTL_genes_positions.pdf %s" %(tempdir,options.output8))
591
592 if __name__=="__main__":
593 __main__()
594
595
596
597