view classifier/classifier.py @ 1:d8547ff82697 draft default tip

Deleted selected files
author nanettec
date Fri, 18 Mar 2016 05:16:02 -0400
parents ef9c2044d86a
children
line wrap: on
line source

"""
@summary: Classify eQTLs as cis or trans - calculate "hybrid" overlap score
@author: nanette.coetzer@gmail.com
@version 5

"""
import optparse, sys
import tempfile
import os, re
import subprocess

def stop_err( msg ):
    sys.stderr.write( "%s\n" % msg )
    sys.exit()
    
def __main__():
    #Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option("-m", "--rscript", default=None, dest="rscript", 
                      help="R script")
    parser.add_option("-i", "--input1", default=None, dest="input1", 
                      help="eQTLs file")
    parser.add_option("-j", "--input2", default=None, dest="input2", 
                      help="Chr summary file")
    parser.add_option("-k", "--input3", default=None, dest="input3", 
                      help="Gene positions file")
    parser.add_option("-l", "--input4", default=None, dest="input4", 
                      help="Lookup file")
    
    #parser.add_option("-m", "--score_type", default=2, dest="score_type", 
    #                  help="Select dM scale (1) or hybrid (2)")

    parser.add_option("-o", "--output1", default=None, dest="output1", 
                      help="All classification file")
    parser.add_option("-p", "--output2", default=None, dest="output2", 
                      help="Cis classification file")
    parser.add_option("-q", "--output3", default=None, dest="output3", 
                      help="Trans classification file")
    parser.add_option("-r", "--output4", default=None, dest="output4", 
                      help="Classification summary file")
    parser.add_option("-s", "--output5", default=None, dest="output5", 
                      help="Chr summary file v2")
    parser.add_option("-t", "--output6", default=None, dest="output6", 
                      help="Gene positions file v2")
    parser.add_option("-u", "--output7", default=None, dest="output7", 
                      help="eQTLs per gene summary file")
    parser.add_option("-v", "--output8", default=None, dest="output8", 
                      help="eQTL vs gene position plot")
    (options, args) = parser.parse_args()
    
    try:
        open(options.input1, "r").close()
    except TypeError, e:
        stop_err("You need to supply the eQTL file:\n" + str(e))
    except IOError, e:
        stop_err("Can not open the eQTL file:\n" + str(e))

    try:
        open(options.input2, "r").close()
    except TypeError, e:
        stop_err("You need to supply the Chr summary file:\n" + str(e))
    except IOError, e:
        stop_err("Can not open the Chr summary file:\n" + str(e))
        
    try:
        open(options.input3, "r").close()
    except TypeError, e:
        stop_err("You need to supply the Gene positions file:\n" + str(e))
    except IOError, e:
        stop_err("Can not open the Gene positions file:\n" + str(e))
        
    try:
        open(options.input4, "r").close()
    except TypeError, e:
        stop_err("You need to supply the Lookup file:\n" + str(e))
    except IOError, e:
        stop_err("Can not open the Lookup file:\n" + str(e))

    ##############################################
    lookup_list = []
    
    # Convert eQTL input file, change marker number of last interval in each chromosome
    check2 = ""
    prev_m = "0"
    prev_c = "1"
    r = re.compile("^(\d+)\s+(\d+)\s+(\d+.\d+)\s+")
    indict = {}
    max_cM_dict = {}
    lookup = open(options.input4,'r')
    for line in lookup:
        l = line.strip().split("\t")
        if l[6] == "0":
            #print l
            max_cM_dict[l[1]] = l[4]
        lookup_list.append(l)
        if l[0] != "id" and l[6] != "0":
            c = l[1]
            if prev_c != c:
                indict[prev_c+"\t"+str(float(p))] = prev_c+"\t"+m+"\t"+str(float(p))
            m = l[2]
            p = l[3]
            prev_c = c
    #print max_cM_dict
    
    #add last line of last chromosome
    indict[prev_c+"\t"+p] = prev_c+"\t"+m+"\t"+p

    #print indict
       
    temp_file = tempfile.mktemp()
    eqtl_temp_output = open(temp_file,'w')
    
    eQTL_size = 0
    eQTL_nr = 0
    eqtl_in = open(options.input1,'r')
    for line in eqtl_in:
        l = line.strip().split("\t")
        if l[3].startswith("c"):
            header = 1  # yes
        else:
            header = 0  # no 
        if header != 1:
            sfull = l[3]+"\t"+l[13]+"\t"+str(float(l[14]))
            efull = l[3]+"\t"+l[15]+"\t"+str(float(l[16]))
            pfull = l[3]+"\t"+l[4]+"\t"+str(float(l[5]))
            sshort = l[3]+"\t"+str(float(l[14]))
            eshort = l[3]+"\t"+str(float(l[16]))
            pshort = l[3]+"\t"+str(float(l[5]))

            if sshort in indict.keys():
                sfull = indict[sshort]
            if eshort in indict.keys():
                efull = indict[eshort]
            if pshort in indict.keys():
                pfull = indict[pshort]
            s = sfull.split("\t")
            e = efull.split("\t")
            p = pfull.split("\t")
            eqtl_temp_output.write(l[0]+"\t"+l[1]+"\t"+s[0]+"\t"+s[1]+"\t"+s[2]+"\t"+e[1]+"\t"+e[2]+"\t"+p[1]+"\t"+p[2]+"\t"+l[6]+"\t"+l[8]+"\t"+l[9]+"\t"+l[11]+"\n")
            #print l[0]+"\t"+l[1]+"\t"+s[0]+"\t"+s[1]+"\t"+s[2]+"\t"+e[1]+"\t"+e[2]+"\t"+p[1]+"\t"+p[2]+"\t"+l[6]+"\t"+l[8]+"\t"+l[9]+"\t"+l[11]+"\n"
            
            eQTL_size += (float(l[16])*100)-(float(l[14])*100)
            eQTL_nr += 1
    lookup.close()
    eqtl_in.close()
    eqtl_temp_output.close()
    
    cutoff = (eQTL_size/float(eQTL_nr))/2
    #print "cutoff: ",cutoff
    #############################################
    
    cis = 0
    trans = 0
    no_result = 0
    total = 0
    cis_peakLR = 0
    cis_rsq = 0 
    cis_rtsq = 0
    trans_peakLR = 0
    trans_rsq = 0 
    trans_rtsq = 0
    nr_peakLR = 0
    nr_rsq = 0 
    nr_rtsq = 0
    tot_peakLR = 0
    tot_rsq = 0 
    tot_rtsq = 0

    # Create 2 dictionaries with gene info: gene_dict and gene_bin_dict
    gene_positions = open(options.input3,'r')  # gene_pos.txt
    gene_dict = {}
    gene_bin_dict = {}
    for line in gene_positions:
        l = line.strip().split("\t")
        if l[0].startswith("gene") or l[0].startswith("Gene") or l[0].startswith("name") or l[0].startswith("Name"):
            header = 1  # yes
        else:
            header = 0  # no 
        if header != 1:
            # create a gene dictionary form gene positions file
            gene_dict[l[0]] = l[1:]+[0,0,0,0]
            chr_gene = l[1]
            bp_gene = round((int(l[2])+int(l[3]))/2.0,0)

            # for each gene, get the correct bin from the lookup file
            # first read lookup file into a list of lists (lookup_list), faster
            gene_bin = "NA"
            #print lookup_list
            for lu in lookup_list:
                if (lu[1]==chr_gene) and (bp_gene>=int(float(lu[5]))) and (float(lu[6])!=0):
                    gene_bin = lu[0]
            gene_bin_dict[l[0]]=gene_bin
            
    gene_positions.close()
    #print "Done 2"
    eqtl_outfile = open(options.output1,'w')    # classify.txt
    eqtl_outfile.write("gene\tindex\tchr\tstart_marker\tstart_int\tend_marker\tend_int\tpeak_marker\tpeak_int\tpeakLR\trsq\trtsq\tadditive\tclassification\teQTL_bin\tgene_bin\toverlap_score\tstatus\n")
    
    max_peakLR = 0.0
    eqtl_get_maxLR = open(temp_file,'r')
    for eqtlline in eqtl_get_maxLR:
        l = eqtlline.strip().split("\t")
        if l[1].startswith("i"):
            header = 1  # yes
        else:
            header = 0  # no 
        if header != 1:
            if ((l[9] != 'inf') and (float(l[9]) > max_peakLR)):
                max_peakLR = float(l[9])
    eqtl_get_maxLR.close()
                
    count = 0
    position = 0
    status = 0
    x2 = ""
    
    eqtl = open(temp_file,'r')  # eQTLs.txt
    for eqtlline in eqtl:
        classification = ""
        l = eqtlline.strip().split("\t")
        if l[1].startswith("i"):
            header = 1  # yes
        else:
            header = 0  # no 
        if header != 1:
            gene = l[0]
            # for each eqtl, get the correct bin from the lookup file (lookup_list)
            eqtl_bin = "NA"
            for lu in lookup_list:
                if (lu[1]==l[2]) and (float(l[8])>=float(lu[3])) and (float(lu[6])!=0):
                    eqtl_bin = lu[0]
            try: 
                g = gene_dict[gene]
                position = 1
            except:
                # if the position of the gene is not known, then eQTL cannot be classified
                classification = "no_result"
                overlap_score = 0
                position = 0
                full_partial = "."
            if position == 1:
                gene_chr = g[0]
                eqtl_chr = l[2]
                # if the gene and eQTL on different chromosomes, then eQTL classified as trans
                if gene_chr != eqtl_chr:
                    classification = "trans"
                    overlap_score = 0
                    full_partial = "."
                # if the gene and eQTL on the same chromosome, then test if cis / trans
                else:
                    #print "else"
                    gene_bp = int(round((int(g[1])+int(g[2]))/float(2),0))
                    prev_bp = 0
                    prev_line = [0,0,0,0.0,0.0,0]
                    lookup = open(options.input4,'r')  # lookup.txt
                    status = 0
                    for line in lookup:
                        look = line.strip().split("\t")
                        if look[1] == gene_chr:
                            # for the gene position in bp (v1), calculate the best estimate cM position (x2)
                            # if gene is NOT to the left of the 1st marker (< 0 cM) or to the right of the last marker, x2 will be calculated and status will be 1

                            if (int(float(gene_bp)) > int(float(prev_bp))) and (int(float(gene_bp)) <= int(float(look[5]))):
                                # 2 : cM and 1 : bp
                                #print "gene_bp = "+str(gene_bp)
                                #print "prev_bp = "+str(prev_bp)
                                #print "look[5] = "+str(look[5])
                                v1 = int(gene_bp)
                                s1 = int(prev_bp)
                                l1 = int(float(look[5]))                    
                                s2 = float(prev_line[4])
                                l2 = float(look[4])
                                x2 = round(((v1-s1)*(l2-s2)/(l1-s1))+s2,2) 
                                status = 1
                                #print "x2 = "+str(x2)
                            prev_bp = look[5]
                            prev_line = look    
                    lookup.close()
                    #print "status = "+str(status)+"\n"
                    if status == 0:
                        #print "*********" * 100
                        #print "gene at end of chr\n\n\n"
                        x2 = round(float(max_cM_dict[gene_chr]),2)
                        #print x2
                        g_bin = "NA"
                        # if the gene is to the left of the 1st marker (< 0 cM) or to the right of the last marker; this could be improved (for the maize data it is fine)
                        #classification = "no_result"
                    #print classification
                    if classification != "if status == 0:":
                        # get the eQTL start and end cM positions (from lookup table)
                        #lookup = open(options.input4,'r')  # lookup.txt
                        #count = 0
                        #for line in lookup:
                        #    count += 1
                        #    if count > 1:
                        #        look = line.strip().split("\t")
                        #        if int(look[1]) == int(eqtl_chr) and float(look[3]) == float(l[4]):
                        #            eqtl_start = float(look[4])                            
                        #        if int(look[1]) == int(eqtl_chr) and float(look[3]) == float(l[6]):
                        #            eqtl_end = float(look[4])
                        #lookup.close()
                        # compare calculated gene cM position (x2) with eQTL start and end cM positions
                        # calculate a region around the gene (5 cM to either side) and test whether (i) eQTL start or (ii) eQTL end overlaps with that region; or (iii) whether the eQTL is entirely in that region
                        eqtl_start = float(l[4])*100
                        eqtl_end = float(l[6])*100
                        eqtl_peak = float(l[8])*100
                        #cutoff = 5
                        
                        if (int(gene_chr) == int(eqtl_chr)) and ((eqtl_start >= x2-float(cutoff) and eqtl_start <= x2+float(cutoff)) or (eqtl_end >= x2-float(cutoff) and eqtl_end <= x2+float(cutoff)) or (eqtl_start <= x2-float(cutoff) and eqtl_end >= x2+float(cutoff))):
                            classification = "cis"
                            
                            #if options.score_type == str(1):
                            #    # calculate overlap score (dM score)
                            #    start_end_list = [float(eqtl_start/10), float(eqtl_end/10), (x2-float(cutoff))/10, (x2+float(cutoff))/10]						
                            #    start_end_list.sort()
                            #   
                            #    ratio = ((start_end_list[2]-start_end_list[1])+1)/float((start_end_list[3]-start_end_list[0])+1)
                            #    diff_peaks = float(abs((x2/10)-float(eqtl_peak/10)))+1
                            #    overlap_score = float(ratio/diff_peaks)                                     
                            #else:
                            #    # calculate overlap score (hybrid score: full overlap (both peaks in overlapping region) = M scale; partial overlap = dM scale)
                            #    # Example of scale: 0.0801 M = 0.801 dM = 8.01 cM
                            
                            start_end_list = [float(eqtl_start/10),float(eqtl_end/10),(x2-float(cutoff))/10, (x2+float(cutoff))/10]
                            start_end_list.sort()
                            ######### test for full / partial overlap #########
                            if ((float(eqtl_peak/10) >= start_end_list[1]) and (float(eqtl_peak/10) <= start_end_list[2])) \
                            and ((x2/10 >= start_end_list[1]) and (x2/10 <= start_end_list[2])):
                                ratio = ((start_end_list[2]/10-start_end_list[1]/10)+1)/float((start_end_list[3]/10-start_end_list[0]/10)+1)
                                diff_peaks = float(abs(x2/100-float(eqtl_peak/100)))+1
                                overlap_score = float(ratio/diff_peaks)
                                full_partial = "full"
                    
                            else:
                                # partial overlap --> use dM scale (multiply position by 10) --> already done
                                ratio = ((start_end_list[2]-start_end_list[1])+1)/float((start_end_list[3]-start_end_list[0])+1)
                                diff_peaks = float(abs(x2/10-float(eqtl_peak/10)))+1
                                overlap_score = float(ratio/diff_peaks)
                                full_partial = "partial"                                
                    
                        else:
                            classification = "trans"
                            overlap_score = 0
                            full_partial = "."
            if classification == "cis":
                cis += 1
                cis_peakLR += float(l[9])
                cis_rsq += float(l[10])
                cis_rtsq += float(l[11])
                g[4] += 1
            if classification == "trans":
                trans += 1
                trans_peakLR += float(l[9])
                trans_rsq += float(l[10])
                trans_rtsq += float(l[11])
                g[5] += 1
            if classification == "no_result":
                no_result += 1
                nr_peakLR += float(l[9])
                nr_rsq += float(l[10])
                nr_rtsq += float(l[11])
                g[6] += 1
            g[3] += 1
            total += 1
            tot_peakLR += float(l[9])
            tot_rsq += float(l[10])
            tot_rtsq += float(l[11])
            try:
                g_bin = gene_bin_dict[l[0]]
            except:
                g_bin = "NA"
            if status == 0:
                #g_bin = "end c"+str(gene_chr)
                g_bin = "NA"
                gene_bin_dict[l[0]] = g_bin
            if x2 == 0.0:
                #g_bin = "start c"+str(gene_chr)
                g_bin = "NA"
                gene_bin_dict[l[0]] = g_bin
            if l[9] != "inf":
                eqtl_outfile.write(eqtlline.strip()+"\t"+classification+"\t"+eqtl_bin+"\t"+str(g_bin)+"\t"+str(overlap_score)+"\t"+str(full_partial)+"\n")
            else:
                #eqtl_outfile.write(eqtlline.strip()+"\t"+classification+"\t"+eqtl_bin+"\t"+str(g_bin)+"\n")
                temp_line = "\t".join(l[0:9])+"\t"+str(max_peakLR)+"\t"+"\t".join(l[10:])
                eqtl_outfile.write(temp_line+"\t"+classification+"\t"+eqtl_bin+"\t"+str(g_bin)+"\t"+str(overlap_score)+"\t"+str(full_partial)+"\n")
            #print(eqtlline.strip()+"\t"+classification+"\t"+eqtl_bin+"\t"+str(g_bin))
    eqtl.close()
    eqtl_outfile.close()
    
    # write cis and trans classification files
    cis_chr = []
    trans_chr = []
    all_chr = []
    unknown_chr = []
    all_outfile = open(options.output1,'r')    # all classify.txt
    cis_outfile = open(options.output2,'w')    # cis classify.txt
    cis_outfile.write("gene\tindex\tchr\tstart_marker\tstart_int\tend_marker\tend_int\tpeak_marker\tpeak_int\tpeakLR\trsq\trtsq\tparent_up_reg\tclassification\teQTL_bin\tgene_bin\toverlap_score\tstatus\n")
    trans_outfile = open(options.output3,'w')    # trans classify.txt
    trans_outfile.write("gene\tindex\tchr\tstart_marker\tstart_int\tend_marker\tend_int\tpeak_marker\tpeak_int\tpeakLR\trsq\trtsq\tparent_up_reg\tclassification\teQTL_bin\tgene_bin\toverlap_score\tstatus\n")
    for allline in all_outfile:
            all_chr.append(allline.strip().split("\t")[2])
            if allline.strip().split("\t")[13] == "cis":
                cis_outfile.write(allline.strip()+"\n")
                cis_chr.append(allline.strip().split("\t")[2])
            if allline.strip().split("\t")[13] == "trans":
                trans_outfile.write(allline.strip()+"\n")
                trans_chr.append(allline.strip().split("\t")[2])
            if allline.strip().split("\t")[13] == "no_result":
                unknown_chr.append(allline.strip().split("\t")[2])
    cis_outfile.close()
    trans_outfile.close()
    all_outfile.close()
    
    # calcualte summary statistics
    p_cis = round((cis/float(total))*100,2)
    p_trans = round((trans/float(total))*100,2)
    p_no_result = round((no_result/float(total))*100,2)
    p_total = round((total/float(total))*100,2)
    
    mean_cis_peakLR = round(cis_peakLR/float(cis),1)
    mean_cis_rsq = round(cis_rsq/float(cis),2)
    mean_cis_rtsq = round(cis_rtsq/float(cis),2)
    
    mean_trans_peakLR = round(trans_peakLR/float(trans),1)
    mean_trans_rsq = round(trans_rsq/float(trans),2)
    mean_trans_rtsq = round(trans_rtsq/float(trans),2)
    
    if no_result != 0:
        mean_nr_peakLR = round(nr_peakLR/float(no_result),1)
        mean_nr_rsq = round(nr_rsq/float(no_result),2)
        mean_nr_rtsq = round(nr_rtsq/float(no_result),2)
    else: 
        mean_nr_peakLR = "-"
        mean_nr_rsq = "-"
        mean_nr_rtsq = "-"

    mean_tot_peakLR = round(tot_peakLR/float(total),1)
    mean_tot_rsq = round(tot_rsq/float(total),2)
    mean_tot_rtsq = round(tot_rtsq/float(total),2)
    
    cis_summary = "cis\t"+str(cis)+"\t"+ str(p_cis)+"%\t"+str(mean_cis_peakLR)+"\t"+str(mean_cis_rsq)+"\t"+str(mean_cis_rtsq)+"\n"
    trans_summary = "trans\t"+str(trans)+"\t"+ str(p_trans)+"%\t"+str(mean_trans_peakLR)+"\t"+str(mean_trans_rsq)+"\t"+str(mean_trans_rtsq)+"\n"
    no_result_summary = "unknown\t"+str(no_result)+"\t"+ str(p_no_result)+"%\t"+str(mean_nr_peakLR)+"\t"+str(mean_nr_rsq)+"\t"+str(mean_nr_rtsq)+"\n"
    total_summary = "all\t"+str(total)+"\t"+ str(p_total)+"%\t"+str(mean_tot_peakLR)+"\t"+str(mean_tot_rsq)+"\t"+str(mean_tot_rtsq)+"\n"
    
    summary_outfile = open(options.output4,'w')    # classify_summary.txt
    summary_outfile.write("class\tnumber_eQTLs\tpercentage_eQTLs\taverage_peakLR\taverage_rsq\taverage_rtsq\n")
    summary_outfile.write(cis_summary+trans_summary+no_result_summary+total_summary)       
    summary_outfile.close()
    
    
    genes_chr = []
    for g in gene_dict.keys():
        genes_chr.append(gene_dict[g][0])
        
    count_genes = {}
    for i in set(genes_chr):
        count_genes[i] = genes_chr.count(i)
    count_cis = {}
    for i in set(cis_chr):
        count_cis[i] = cis_chr.count(i)   
    count_trans = {}
    for i in set(trans_chr):
        count_trans[i] = trans_chr.count(i)
    count_all = {}
    for i in set(all_chr):
        count_all[i] = all_chr.count(i)   
    count_unknown = {}
    for i in set(unknown_chr):
        count_unknown[i] = unknown_chr.count(i)

    i = 0
    tot_genes = 0
    tot_cis = 0
    tot_trans = 0
    tot_unknown = 0
    tot_all = 0
    chr_summary_v2 = open(options.output5,'w')     
    chr_summary = open(options.input2,'r')
    for line in chr_summary:
        i += 1
        if i == 1:
            chr_summary_v2.write(line.strip()+"\tgenes\tcis eQTL\ttrans eQT\tunknown eQTL\tall eQTL\n")
        else:
            l = line.strip().split("\t")
            if l[0] != "Total":
                try:
                    count_unknown_ans = count_unknown[l[0]]
                    tot_unknown += count_unknown[l[0]]
                except:
                    count_unknown_ans = 0
                    tot_unknown = 0
                chr_summary_v2.write(line.strip()+"\t"+str(count_genes[l[0]])+"\t"+str(count_cis[l[0]])+"\t"+str(count_trans[l[0]])+"\t"+str(count_unknown_ans)+"\t"+str(count_all[l[0]])+"\n")
                tot_genes += count_genes[l[0]]
                tot_cis += count_cis[l[0]]
                tot_trans += count_trans[l[0]]
                tot_all += count_all[l[0]]
            else:
                chr_summary_v2.write(line.strip()+"\t"+str(tot_genes)+"\t"+str(tot_cis)+"\t"+str(tot_trans)+"\t"+str(tot_unknown)+"\t"+str(tot_all)+"\n")
    chr_summary_v2.close()
    chr_summary.close()
    
    gene_pos_v2 = open(options.output6,'w')
    gene_pos_v2.write("gene\tchr\tstart_bp\tend_bp\tnum_eQTL\tnum_cis_eQTL\tnum_trans_eQTL\tnum_unknown_eQTL\tgene_bin\n")
    
    eqtl_count = 0
    genes_with_eqtl = 0
    cis_count = 0
    genes_with_cis = 0
    trans_count = 0
    genes_with_trans = 0
    only_cis = 0
    only_trans = 0
    cis_and_trans = 0
    cis_or_trans = 0
    for g in gene_dict.keys():
        gene_pos_v2.write(g+"\t"+str(gene_dict[g][0])+"\t"+str(gene_dict[g][1])+"\t"+str(gene_dict[g][2])+"\t"+str(gene_dict[g][3])+"\t"+str(gene_dict[g][4])+"\t"+str(gene_dict[g][5])+"\t"+str(gene_dict[g][6])+"\t"+str(gene_bin_dict[g])+"\n")
        # num all eQTL
        if int(gene_dict[g][3]) != 0:
            genes_with_eqtl += 1
            eqtl_count += int(gene_dict[g][3])
        # num cis eQTL
        if int(gene_dict[g][4]) != 0:
            genes_with_cis += 1
            cis_count += int(gene_dict[g][4])
        # num trans eQTL
        if int(gene_dict[g][5]) != 0:
            genes_with_trans += 1
            trans_count += int(gene_dict[g][5])
        if (int(gene_dict[g][4]) != 0) and (int(gene_dict[g][5]) == 0):
            only_cis += 1
        if (int(gene_dict[g][5]) != 0) and (int(gene_dict[g][4]) == 0):
            only_trans += 1
        if (int(gene_dict[g][5]) != 0) and (int(gene_dict[g][4]) != 0):
            cis_and_trans += 1
        if (int(gene_dict[g][5]) != 0) or (int(gene_dict[g][4]) != 0):
            cis_or_trans += 1
            
    ave_num_eqtl_per_gene = eqtl_count/float(genes_with_eqtl)
    ave_num_cis_eqtl_per_gene = cis_count/float(genes_with_cis)
    ave_num_trans_eqtl_per_gene = trans_count/float(genes_with_trans)
    #print "Average number of eQTLs per gene with eQTL\t"+str(round(ave_num_eqtl_per_gene,1))
    #print "Average number of cis eQTLs per gene with cis eQTL\t"+str(round(ave_num_cis_eqtl_per_gene,1))
    #print "Average number of trans eQTLs per gene with trans eQTL\t"+str(round(ave_num_trans_eqtl_per_gene,1))
    #print "Number of genes with only cis eQTL (no trans)\t"+str(only_cis)
    #print "Number of genes with only trans eQTL (no cis)\t"+str(only_trans)
    #print "Number of genes with cis and trans eQTL\t"+str(cis_and_trans)
    #print "Number of genes with cis or trans eQTL\t"+str(cis_or_trans)
    
    eqtl_per_gene = open(options.output7,'w')
    eqtl_per_gene.write("Average number of eQTLs per gene with eQTL\t"+str(round(ave_num_eqtl_per_gene,1))+"\n")
    eqtl_per_gene.write("Average number of cis eQTLs per gene with cis eQTL\t"+str(round(ave_num_cis_eqtl_per_gene,1))+"\n")
    eqtl_per_gene.write("Average number of trans eQTLs per gene with trans eQTL\t"+str(round(ave_num_trans_eqtl_per_gene,1))+"\n")
    eqtl_per_gene.write("Number of genes with only cis eQTL (no trans)\t"+str(only_cis)+" ("+str(round((int(only_cis)/float(cis_or_trans))*100,1))+"%)\n")
    eqtl_per_gene.write("Number of genes with only trans eQTL (no cis)\t"+str(only_trans)+" ("+str(round((int(only_trans)/float(cis_or_trans))*100,1))+"%)\n")
    eqtl_per_gene.write("Number of genes with cis and trans eQTL\t"+str(cis_and_trans)+" ("+str(round((int(cis_and_trans)/float(cis_or_trans))*100,1))+"%)\n")
    eqtl_per_gene.write("Number of genes with cis or trans eQTL\t"+str(cis_or_trans)+" ("+str(round((int(cis_and_trans)/float(cis_and_trans))*100,1))+"%)\n")
    eqtl_per_gene.close()
    
    gene_pos_v2.close()
    
    # Create temp direcotry
    tempdir = tempfile.mkdtemp()

    # copy INPUT file to the temp directory
    s = "cp %s %s/all_classification.txt" %(options.output1, tempdir)
    subprocess.call(s, shell=True)
    s = "cp %s %s/lookup.txt" %(options.input4, tempdir)
    subprocess.call(s, shell=True)

    # create R script => save in temp directory
    # generate new header 
    new_script = open(tempdir+"/new_script.r","w")
    header = "setwd(\"%s\")" %tempdir
    new_script.write(header+"\n")
    # add script body
    script = open(options.rscript,"r")
    for line in script:
        new_script.write(line.strip()+"\n")
    new_script.close()
    
    # run R script from temp directory
    s = "R CMD BATCH %s/new_script.r out.txt" %tempdir
    subprocess.call(s, shell=True)
    
    # run R script from temp directory
    s = "R CMD BATCH %s/new_script.r out.txt" %tempdir
    subprocess.call(s, shell=True)
    
    os.system("mv %s/Rplot_eQTL_genes_positions.pdf %s" %(tempdir,options.output8))
    
if __name__=="__main__": 
    __main__()