classifier: classifier/classifier.py comparison

comparison classifier/classifier.py @ 0:ef9c2044d86a draft

Uploaded

author	nanettec
date	Fri, 18 Mar 2016 05:15:29 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:ef9c2044d86a
+"""
+@summary: Classify eQTLs as cis or trans - calculate "hybrid" overlap score
+@author: nanette.coetzer@gmail.com
+@version 5
+"""
+import optparse, sys
+import tempfile
+import os, re
+import subprocess
+def stop_err( msg ):
+sys.stderr.write( "%s\n" % msg )
+sys.exit()
+def __main__():
+#Parse Command Line
+parser = optparse.OptionParser()
+parser.add_option("-m", "--rscript", default=None, dest="rscript",
+help="R script")
+parser.add_option("-i", "--input1", default=None, dest="input1",
+help="eQTLs file")
+parser.add_option("-j", "--input2", default=None, dest="input2",
+help="Chr summary file")
+parser.add_option("-k", "--input3", default=None, dest="input3",
+help="Gene positions file")
+parser.add_option("-l", "--input4", default=None, dest="input4",
+help="Lookup file")
+#parser.add_option("-m", "--score_type", default=2, dest="score_type",
+#                  help="Select dM scale (1) or hybrid (2)")
+parser.add_option("-o", "--output1", default=None, dest="output1",
+help="All classification file")
+parser.add_option("-p", "--output2", default=None, dest="output2",
+help="Cis classification file")
+parser.add_option("-q", "--output3", default=None, dest="output3",
+help="Trans classification file")
+parser.add_option("-r", "--output4", default=None, dest="output4",
+help="Classification summary file")
+parser.add_option("-s", "--output5", default=None, dest="output5",
+help="Chr summary file v2")
+parser.add_option("-t", "--output6", default=None, dest="output6",
+help="Gene positions file v2")
+parser.add_option("-u", "--output7", default=None, dest="output7",
+help="eQTLs per gene summary file")
+parser.add_option("-v", "--output8", default=None, dest="output8",
+help="eQTL vs gene position plot")
+(options, args) = parser.parse_args()
+try:
+open(options.input1, "r").close()
+except TypeError, e:
+stop_err("You need to supply the eQTL file:\n" + str(e))
+except IOError, e:
+stop_err("Can not open the eQTL file:\n" + str(e))
+try:
+open(options.input2, "r").close()
+except TypeError, e:
+stop_err("You need to supply the Chr summary file:\n" + str(e))
+except IOError, e:
+stop_err("Can not open the Chr summary file:\n" + str(e))
+try:
+open(options.input3, "r").close()
+except TypeError, e:
+stop_err("You need to supply the Gene positions file:\n" + str(e))
+except IOError, e:
+stop_err("Can not open the Gene positions file:\n" + str(e))
+try:
+open(options.input4, "r").close()
+except TypeError, e:
+stop_err("You need to supply the Lookup file:\n" + str(e))
+except IOError, e:
+stop_err("Can not open the Lookup file:\n" + str(e))
+##############################################
+lookup_list = []
+# Convert eQTL input file, change marker number of last interval in each chromosome
+check2 = ""
+prev_m = "0"
+prev_c = "1"
+r = re.compile("^(\d+)\s+(\d+)\s+(\d+.\d+)\s+")
+indict = {}
+max_cM_dict = {}
+lookup = open(options.input4,'r')
+for line in lookup:
+l = line.strip().split("\t")
+if l[6] == "0":
+#print l
+max_cM_dict[l[1]] = l[4]
+lookup_list.append(l)
+if l[0] != "id" and l[6] != "0":
+c = l[1]
+if prev_c != c:
+indict[prev_c+"\t"+str(float(p))] = prev_c+"\t"+m+"\t"+str(float(p))
+m = l[2]
+p = l[3]
+prev_c = c
+#print max_cM_dict
+#add last line of last chromosome
+indict[prev_c+"\t"+p] = prev_c+"\t"+m+"\t"+p
+#print indict
+temp_file = tempfile.mktemp()
+eqtl_temp_output = open(temp_file,'w')
+eQTL_size = 0
+eQTL_nr = 0
+eqtl_in = open(options.input1,'r')
+for line in eqtl_in:
+l = line.strip().split("\t")
+if l[3].startswith("c"):
+header = 1  # yes
+else:
+header = 0  # no
+if header != 1:
+sfull = l[3]+"\t"+l[13]+"\t"+str(float(l[14]))
+efull = l[3]+"\t"+l[15]+"\t"+str(float(l[16]))
+pfull = l[3]+"\t"+l[4]+"\t"+str(float(l[5]))
+sshort = l[3]+"\t"+str(float(l[14]))
+eshort = l[3]+"\t"+str(float(l[16]))
+pshort = l[3]+"\t"+str(float(l[5]))
+if sshort in indict.keys():
+sfull = indict[sshort]
+if eshort in indict.keys():
+efull = indict[eshort]
+if pshort in indict.keys():
+pfull = indict[pshort]
+s = sfull.split("\t")
+e = efull.split("\t")
+p = pfull.split("\t")
+eqtl_temp_output.write(l[0]+"\t"+l[1]+"\t"+s[0]+"\t"+s[1]+"\t"+s[2]+"\t"+e[1]+"\t"+e[2]+"\t"+p[1]+"\t"+p[2]+"\t"+l[6]+"\t"+l[8]+"\t"+l[9]+"\t"+l[11]+"\n")
+#print l[0]+"\t"+l[1]+"\t"+s[0]+"\t"+s[1]+"\t"+s[2]+"\t"+e[1]+"\t"+e[2]+"\t"+p[1]+"\t"+p[2]+"\t"+l[6]+"\t"+l[8]+"\t"+l[9]+"\t"+l[11]+"\n"
+eQTL_size += (float(l[16])*100)-(float(l[14])*100)
+eQTL_nr += 1
+lookup.close()
+eqtl_in.close()
+eqtl_temp_output.close()
+cutoff = (eQTL_size/float(eQTL_nr))/2
+#print "cutoff: ",cutoff
+#############################################
+cis = 0
+trans = 0
+no_result = 0
+total = 0
+cis_peakLR = 0
+cis_rsq = 0
+cis_rtsq = 0
+trans_peakLR = 0
+trans_rsq = 0
+trans_rtsq = 0
+nr_peakLR = 0
+nr_rsq = 0
+nr_rtsq = 0
+tot_peakLR = 0
+tot_rsq = 0
+tot_rtsq = 0
+# Create 2 dictionaries with gene info: gene_dict and gene_bin_dict
+gene_positions = open(options.input3,'r')  # gene_pos.txt
+gene_dict = {}
+gene_bin_dict = {}
+for line in gene_positions:
+l = line.strip().split("\t")
+if l[0].startswith("gene") or l[0].startswith("Gene") or l[0].startswith("name") or l[0].startswith("Name"):
+header = 1  # yes
+else:
+header = 0  # no
+if header != 1:
+# create a gene dictionary form gene positions file
+gene_dict[l[0]] = l[1:]+[0,0,0,0]
+chr_gene = l[1]
+bp_gene = round((int(l[2])+int(l[3]))/2.0,0)
+# for each gene, get the correct bin from the lookup file
+# first read lookup file into a list of lists (lookup_list), faster
+gene_bin = "NA"
+#print lookup_list
+for lu in lookup_list:
+if (lu[1]==chr_gene) and (bp_gene>=int(float(lu[5]))) and (float(lu[6])!=0):
+gene_bin = lu[0]
+gene_bin_dict[l[0]]=gene_bin
+gene_positions.close()
+#print "Done 2"
+eqtl_outfile = open(options.output1,'w')    # classify.txt
+eqtl_outfile.write("gene\tindex\tchr\tstart_marker\tstart_int\tend_marker\tend_int\tpeak_marker\tpeak_int\tpeakLR\trsq\trtsq\tadditive\tclassification\teQTL_bin\tgene_bin\toverlap_score\tstatus\n")
+max_peakLR = 0.0
+eqtl_get_maxLR = open(temp_file,'r')
+for eqtlline in eqtl_get_maxLR:
+l = eqtlline.strip().split("\t")
+if l[1].startswith("i"):
+header = 1  # yes
+else:
+header = 0  # no
+if header != 1:
+if ((l[9] != 'inf') and (float(l[9]) > max_peakLR)):
+max_peakLR = float(l[9])
+eqtl_get_maxLR.close()
+count = 0
+position = 0
+status = 0
+x2 = ""
+eqtl = open(temp_file,'r')  # eQTLs.txt
+for eqtlline in eqtl:
+classification = ""
+l = eqtlline.strip().split("\t")
+if l[1].startswith("i"):
+header = 1  # yes
+else:
+header = 0  # no
+if header != 1:
+gene = l[0]
+# for each eqtl, get the correct bin from the lookup file (lookup_list)
+eqtl_bin = "NA"
+for lu in lookup_list:
+if (lu[1]==l[2]) and (float(l[8])>=float(lu[3])) and (float(lu[6])!=0):
+eqtl_bin = lu[0]
+try:
+g = gene_dict[gene]
+position = 1
+except:
+# if the position of the gene is not known, then eQTL cannot be classified
+classification = "no_result"
+overlap_score = 0
+position = 0
+full_partial = "."
+if position == 1:
+gene_chr = g[0]
+eqtl_chr = l[2]
+# if the gene and eQTL on different chromosomes, then eQTL classified as trans
+if gene_chr != eqtl_chr:
+classification = "trans"
+overlap_score = 0
+full_partial = "."
+# if the gene and eQTL on the same chromosome, then test if cis / trans
+else:
+#print "else"
+gene_bp = int(round((int(g[1])+int(g[2]))/float(2),0))
+prev_bp = 0
+prev_line = [0,0,0,0.0,0.0,0]
+lookup = open(options.input4,'r')  # lookup.txt
+status = 0
+for line in lookup:
+look = line.strip().split("\t")
+if look[1] == gene_chr:
+# for the gene position in bp (v1), calculate the best estimate cM position (x2)
+# if gene is NOT to the left of the 1st marker (< 0 cM) or to the right of the last marker, x2 will be calculated and status will be 1
+if (int(float(gene_bp)) > int(float(prev_bp))) and (int(float(gene_bp)) <= int(float(look[5]))):
+# 2 : cM and 1 : bp
+#print "gene_bp = "+str(gene_bp)
+#print "prev_bp = "+str(prev_bp)
+#print "look[5] = "+str(look[5])
+v1 = int(gene_bp)
+s1 = int(prev_bp)
+l1 = int(float(look[5]))
+s2 = float(prev_line[4])
+l2 = float(look[4])
+x2 = round(((v1-s1)*(l2-s2)/(l1-s1))+s2,2)
+status = 1
+#print "x2 = "+str(x2)
+prev_bp = look[5]
+prev_line = look
+lookup.close()
+#print "status = "+str(status)+"\n"
+if status == 0:
+#print "*********" * 100
+#print "gene at end of chr\n\n\n"
+x2 = round(float(max_cM_dict[gene_chr]),2)
+#print x2
+g_bin = "NA"
+# if the gene is to the left of the 1st marker (< 0 cM) or to the right of the last marker; this could be improved (for the maize data it is fine)
+#classification = "no_result"
+#print classification
+if classification != "if status == 0:":
+# get the eQTL start and end cM positions (from lookup table)
+#lookup = open(options.input4,'r')  # lookup.txt
+#count = 0
+#for line in lookup:
+#    count += 1
+#    if count > 1:
+#        look = line.strip().split("\t")
+#        if int(look[1]) == int(eqtl_chr) and float(look[3]) == float(l[4]):
+#            eqtl_start = float(look[4])
+#        if int(look[1]) == int(eqtl_chr) and float(look[3]) == float(l[6]):
+#            eqtl_end = float(look[4])
+#lookup.close()
+# compare calculated gene cM position (x2) with eQTL start and end cM positions
+# calculate a region around the gene (5 cM to either side) and test whether (i) eQTL start or (ii) eQTL end overlaps with that region; or (iii) whether the eQTL is entirely in that region
+eqtl_start = float(l[4])*100
+eqtl_end = float(l[6])*100
+eqtl_peak = float(l[8])*100
+#cutoff = 5
+if (int(gene_chr) == int(eqtl_chr)) and ((eqtl_start >= x2-float(cutoff) and eqtl_start <= x2+float(cutoff)) or (eqtl_end >= x2-float(cutoff) and eqtl_end <= x2+float(cutoff)) or (eqtl_start <= x2-float(cutoff) and eqtl_end >= x2+float(cutoff))):
+classification = "cis"
+#if options.score_type == str(1):
+#    # calculate overlap score (dM score)
+#    start_end_list = [float(eqtl_start/10), float(eqtl_end/10), (x2-float(cutoff))/10, (x2+float(cutoff))/10]
+#    start_end_list.sort()
+#
+#    ratio = ((start_end_list[2]-start_end_list[1])+1)/float((start_end_list[3]-start_end_list[0])+1)
+#    diff_peaks = float(abs((x2/10)-float(eqtl_peak/10)))+1
+#    overlap_score = float(ratio/diff_peaks)
+#else:
+#    # calculate overlap score (hybrid score: full overlap (both peaks in overlapping region) = M scale; partial overlap = dM scale)
+#    # Example of scale: 0.0801 M = 0.801 dM = 8.01 cM
+start_end_list = [float(eqtl_start/10),float(eqtl_end/10),(x2-float(cutoff))/10, (x2+float(cutoff))/10]
+start_end_list.sort()
+######### test for full / partial overlap #########
+if ((float(eqtl_peak/10) >= start_end_list[1]) and (float(eqtl_peak/10) <= start_end_list[2])) \
+and ((x2/10 >= start_end_list[1]) and (x2/10 <= start_end_list[2])):
+ratio = ((start_end_list[2]/10-start_end_list[1]/10)+1)/float((start_end_list[3]/10-start_end_list[0]/10)+1)
+diff_peaks = float(abs(x2/100-float(eqtl_peak/100)))+1
+overlap_score = float(ratio/diff_peaks)
+full_partial = "full"
+else:
+# partial overlap --> use dM scale (multiply position by 10) --> already done
+ratio = ((start_end_list[2]-start_end_list[1])+1)/float((start_end_list[3]-start_end_list[0])+1)
+diff_peaks = float(abs(x2/10-float(eqtl_peak/10)))+1
+overlap_score = float(ratio/diff_peaks)
+full_partial = "partial"
+else:
+classification = "trans"
+overlap_score = 0
+full_partial = "."
+if classification == "cis":
+cis += 1
+cis_peakLR += float(l[9])
+cis_rsq += float(l[10])
+cis_rtsq += float(l[11])
+g[4] += 1
+if classification == "trans":
+trans += 1
+trans_peakLR += float(l[9])
+trans_rsq += float(l[10])
+trans_rtsq += float(l[11])
+g[5] += 1
+if classification == "no_result":
+no_result += 1
+nr_peakLR += float(l[9])
+nr_rsq += float(l[10])
+nr_rtsq += float(l[11])
+g[6] += 1
+g[3] += 1
+total += 1
+tot_peakLR += float(l[9])
+tot_rsq += float(l[10])
+tot_rtsq += float(l[11])
+try:
+g_bin = gene_bin_dict[l[0]]
+except:
+g_bin = "NA"
+if status == 0:
+#g_bin = "end c"+str(gene_chr)
+g_bin = "NA"
+gene_bin_dict[l[0]] = g_bin
+if x2 == 0.0:
+#g_bin = "start c"+str(gene_chr)
+g_bin = "NA"
+gene_bin_dict[l[0]] = g_bin
+if l[9] != "inf":
+eqtl_outfile.write(eqtlline.strip()+"\t"+classification+"\t"+eqtl_bin+"\t"+str(g_bin)+"\t"+str(overlap_score)+"\t"+str(full_partial)+"\n")
+else:
+#eqtl_outfile.write(eqtlline.strip()+"\t"+classification+"\t"+eqtl_bin+"\t"+str(g_bin)+"\n")
+temp_line = "\t".join(l[0:9])+"\t"+str(max_peakLR)+"\t"+"\t".join(l[10:])
+eqtl_outfile.write(temp_line+"\t"+classification+"\t"+eqtl_bin+"\t"+str(g_bin)+"\t"+str(overlap_score)+"\t"+str(full_partial)+"\n")
+#print(eqtlline.strip()+"\t"+classification+"\t"+eqtl_bin+"\t"+str(g_bin))
+eqtl.close()
+eqtl_outfile.close()
+# write cis and trans classification files
+cis_chr = []
+trans_chr = []
+all_chr = []
+unknown_chr = []
+all_outfile = open(options.output1,'r')    # all classify.txt
+cis_outfile = open(options.output2,'w')    # cis classify.txt
+cis_outfile.write("gene\tindex\tchr\tstart_marker\tstart_int\tend_marker\tend_int\tpeak_marker\tpeak_int\tpeakLR\trsq\trtsq\tparent_up_reg\tclassification\teQTL_bin\tgene_bin\toverlap_score\tstatus\n")
+trans_outfile = open(options.output3,'w')    # trans classify.txt
+trans_outfile.write("gene\tindex\tchr\tstart_marker\tstart_int\tend_marker\tend_int\tpeak_marker\tpeak_int\tpeakLR\trsq\trtsq\tparent_up_reg\tclassification\teQTL_bin\tgene_bin\toverlap_score\tstatus\n")
+for allline in all_outfile:
+all_chr.append(allline.strip().split("\t")[2])
+if allline.strip().split("\t")[13] == "cis":
+cis_outfile.write(allline.strip()+"\n")
+cis_chr.append(allline.strip().split("\t")[2])
+if allline.strip().split("\t")[13] == "trans":
+trans_outfile.write(allline.strip()+"\n")
+trans_chr.append(allline.strip().split("\t")[2])
+if allline.strip().split("\t")[13] == "no_result":
+unknown_chr.append(allline.strip().split("\t")[2])
+cis_outfile.close()
+trans_outfile.close()
+all_outfile.close()
+# calcualte summary statistics
+p_cis = round((cis/float(total))*100,2)
+p_trans = round((trans/float(total))*100,2)
+p_no_result = round((no_result/float(total))*100,2)
+p_total = round((total/float(total))*100,2)
+mean_cis_peakLR = round(cis_peakLR/float(cis),1)
+mean_cis_rsq = round(cis_rsq/float(cis),2)
+mean_cis_rtsq = round(cis_rtsq/float(cis),2)
+mean_trans_peakLR = round(trans_peakLR/float(trans),1)
+mean_trans_rsq = round(trans_rsq/float(trans),2)
+mean_trans_rtsq = round(trans_rtsq/float(trans),2)
+if no_result != 0:
+mean_nr_peakLR = round(nr_peakLR/float(no_result),1)
+mean_nr_rsq = round(nr_rsq/float(no_result),2)
+mean_nr_rtsq = round(nr_rtsq/float(no_result),2)
+else:
+mean_nr_peakLR = "-"
+mean_nr_rsq = "-"
+mean_nr_rtsq = "-"
+mean_tot_peakLR = round(tot_peakLR/float(total),1)
+mean_tot_rsq = round(tot_rsq/float(total),2)
+mean_tot_rtsq = round(tot_rtsq/float(total),2)
+cis_summary = "cis\t"+str(cis)+"\t"+ str(p_cis)+"%\t"+str(mean_cis_peakLR)+"\t"+str(mean_cis_rsq)+"\t"+str(mean_cis_rtsq)+"\n"
+trans_summary = "trans\t"+str(trans)+"\t"+ str(p_trans)+"%\t"+str(mean_trans_peakLR)+"\t"+str(mean_trans_rsq)+"\t"+str(mean_trans_rtsq)+"\n"
+no_result_summary = "unknown\t"+str(no_result)+"\t"+ str(p_no_result)+"%\t"+str(mean_nr_peakLR)+"\t"+str(mean_nr_rsq)+"\t"+str(mean_nr_rtsq)+"\n"
+total_summary = "all\t"+str(total)+"\t"+ str(p_total)+"%\t"+str(mean_tot_peakLR)+"\t"+str(mean_tot_rsq)+"\t"+str(mean_tot_rtsq)+"\n"
+summary_outfile = open(options.output4,'w')    # classify_summary.txt
+summary_outfile.write("class\tnumber_eQTLs\tpercentage_eQTLs\taverage_peakLR\taverage_rsq\taverage_rtsq\n")
+summary_outfile.write(cis_summary+trans_summary+no_result_summary+total_summary)
+summary_outfile.close()
+genes_chr = []
+for g in gene_dict.keys():
+genes_chr.append(gene_dict[g][0])
+count_genes = {}
+for i in set(genes_chr):
+count_genes[i] = genes_chr.count(i)
+count_cis = {}
+for i in set(cis_chr):
+count_cis[i] = cis_chr.count(i)
+count_trans = {}
+for i in set(trans_chr):
+count_trans[i] = trans_chr.count(i)
+count_all = {}
+for i in set(all_chr):
+count_all[i] = all_chr.count(i)
+count_unknown = {}
+for i in set(unknown_chr):
+count_unknown[i] = unknown_chr.count(i)
+i = 0
+tot_genes = 0
+tot_cis = 0
+tot_trans = 0
+tot_unknown = 0
+tot_all = 0
+chr_summary_v2 = open(options.output5,'w')
+chr_summary = open(options.input2,'r')
+for line in chr_summary:
+i += 1
+if i == 1:
+chr_summary_v2.write(line.strip()+"\tgenes\tcis eQTL\ttrans eQT\tunknown eQTL\tall eQTL\n")
+else:
+l = line.strip().split("\t")
+if l[0] != "Total":
+try:
+count_unknown_ans = count_unknown[l[0]]
+tot_unknown += count_unknown[l[0]]
+except:
+count_unknown_ans = 0
+tot_unknown = 0
+chr_summary_v2.write(line.strip()+"\t"+str(count_genes[l[0]])+"\t"+str(count_cis[l[0]])+"\t"+str(count_trans[l[0]])+"\t"+str(count_unknown_ans)+"\t"+str(count_all[l[0]])+"\n")
+tot_genes += count_genes[l[0]]
+tot_cis += count_cis[l[0]]
+tot_trans += count_trans[l[0]]
+tot_all += count_all[l[0]]
+else:
+chr_summary_v2.write(line.strip()+"\t"+str(tot_genes)+"\t"+str(tot_cis)+"\t"+str(tot_trans)+"\t"+str(tot_unknown)+"\t"+str(tot_all)+"\n")
+chr_summary_v2.close()
+chr_summary.close()
+gene_pos_v2 = open(options.output6,'w')
+gene_pos_v2.write("gene\tchr\tstart_bp\tend_bp\tnum_eQTL\tnum_cis_eQTL\tnum_trans_eQTL\tnum_unknown_eQTL\tgene_bin\n")
+eqtl_count = 0
+genes_with_eqtl = 0
+cis_count = 0
+genes_with_cis = 0
+trans_count = 0
+genes_with_trans = 0
+only_cis = 0
+only_trans = 0
+cis_and_trans = 0
+cis_or_trans = 0
+for g in gene_dict.keys():
+gene_pos_v2.write(g+"\t"+str(gene_dict[g][0])+"\t"+str(gene_dict[g][1])+"\t"+str(gene_dict[g][2])+"\t"+str(gene_dict[g][3])+"\t"+str(gene_dict[g][4])+"\t"+str(gene_dict[g][5])+"\t"+str(gene_dict[g][6])+"\t"+str(gene_bin_dict[g])+"\n")
+# num all eQTL
+if int(gene_dict[g][3]) != 0:
+genes_with_eqtl += 1
+eqtl_count += int(gene_dict[g][3])
+# num cis eQTL
+if int(gene_dict[g][4]) != 0:
+genes_with_cis += 1
+cis_count += int(gene_dict[g][4])
+# num trans eQTL
+if int(gene_dict[g][5]) != 0:
+genes_with_trans += 1
+trans_count += int(gene_dict[g][5])
+if (int(gene_dict[g][4]) != 0) and (int(gene_dict[g][5]) == 0):
+only_cis += 1
+if (int(gene_dict[g][5]) != 0) and (int(gene_dict[g][4]) == 0):
+only_trans += 1
+if (int(gene_dict[g][5]) != 0) and (int(gene_dict[g][4]) != 0):
+cis_and_trans += 1
+if (int(gene_dict[g][5]) != 0) or (int(gene_dict[g][4]) != 0):
+cis_or_trans += 1
+ave_num_eqtl_per_gene = eqtl_count/float(genes_with_eqtl)
+ave_num_cis_eqtl_per_gene = cis_count/float(genes_with_cis)
+ave_num_trans_eqtl_per_gene = trans_count/float(genes_with_trans)
+#print "Average number of eQTLs per gene with eQTL\t"+str(round(ave_num_eqtl_per_gene,1))
+#print "Average number of cis eQTLs per gene with cis eQTL\t"+str(round(ave_num_cis_eqtl_per_gene,1))
+#print "Average number of trans eQTLs per gene with trans eQTL\t"+str(round(ave_num_trans_eqtl_per_gene,1))
+#print "Number of genes with only cis eQTL (no trans)\t"+str(only_cis)
+#print "Number of genes with only trans eQTL (no cis)\t"+str(only_trans)
+#print "Number of genes with cis and trans eQTL\t"+str(cis_and_trans)
+#print "Number of genes with cis or trans eQTL\t"+str(cis_or_trans)
+eqtl_per_gene = open(options.output7,'w')
+eqtl_per_gene.write("Average number of eQTLs per gene with eQTL\t"+str(round(ave_num_eqtl_per_gene,1))+"\n")
+eqtl_per_gene.write("Average number of cis eQTLs per gene with cis eQTL\t"+str(round(ave_num_cis_eqtl_per_gene,1))+"\n")
+eqtl_per_gene.write("Average number of trans eQTLs per gene with trans eQTL\t"+str(round(ave_num_trans_eqtl_per_gene,1))+"\n")
+eqtl_per_gene.write("Number of genes with only cis eQTL (no trans)\t"+str(only_cis)+" ("+str(round((int(only_cis)/float(cis_or_trans))*100,1))+"%)\n")
+eqtl_per_gene.write("Number of genes with only trans eQTL (no cis)\t"+str(only_trans)+" ("+str(round((int(only_trans)/float(cis_or_trans))*100,1))+"%)\n")
+eqtl_per_gene.write("Number of genes with cis and trans eQTL\t"+str(cis_and_trans)+" ("+str(round((int(cis_and_trans)/float(cis_or_trans))*100,1))+"%)\n")
+eqtl_per_gene.write("Number of genes with cis or trans eQTL\t"+str(cis_or_trans)+" ("+str(round((int(cis_and_trans)/float(cis_and_trans))*100,1))+"%)\n")
+eqtl_per_gene.close()
+gene_pos_v2.close()
+# Create temp direcotry
+tempdir = tempfile.mkdtemp()
+# copy INPUT file to the temp directory
+s = "cp %s %s/all_classification.txt" %(options.output1, tempdir)
+subprocess.call(s, shell=True)
+s = "cp %s %s/lookup.txt" %(options.input4, tempdir)
+subprocess.call(s, shell=True)
+# create R script => save in temp directory
+# generate new header
+new_script = open(tempdir+"/new_script.r","w")
+header = "setwd(\"%s\")" %tempdir
+new_script.write(header+"\n")
+# add script body
+script = open(options.rscript,"r")
+for line in script:
+new_script.write(line.strip()+"\n")
+new_script.close()
+# run R script from temp directory
+s = "R CMD BATCH %s/new_script.r out.txt" %tempdir
+subprocess.call(s, shell=True)
+# run R script from temp directory
+s = "R CMD BATCH %s/new_script.r out.txt" %tempdir
+subprocess.call(s, shell=True)
+os.system("mv %s/Rplot_eQTL_genes_positions.pdf %s" %(tempdir,options.output8))
+if __name__=="__main__":
+__main__()

Mercurial > repos > nanettec > classifier

comparison classifier/classifier.py @ 0:ef9c2044d86a draft