Mercurial > repos > youngkim > ezbamqc
view ezBAMQC/ezBAMQC @ 11:5bfcc6c131ed
Uploaded
author | cshl-bsr |
---|---|
date | Wed, 30 Mar 2016 12:14:21 -0400 |
parents | 6610eedd9fae |
children |
line wrap: on
line source
#!/usr/bin/env python2.7 ''' Created on July 24, 2015 @author: Ying Jin @contact: yjin@cshl.edu @status: @version: 0.6.7 ''' import argparse, subprocess,traceback import sys, os, time, string, re import warnings, logging import collections import math, copy import sets from time import strftime from datetime import datetime import ctypes import multiprocessing,threading,Queue def locate(name, path): for root, dirs, files in os.walk(path): if name in files: return os.path.join(root, name) def locBAMqc(loc): for p in os.environ[loc].split(os.pathsep): potential_file = locate('libBAMqc.so',p) if potential_file: return potential_file in_path = locBAMqc('PATH') in_pythonpath = locBAMqc('PYTHONPATH') in_local = locate('libBAMqc.so','./') if in_local: so=ctypes.CDLL(in_local) elif in_path: so=ctypes.CDLL(in_path) elif in_pythonpath: so=ctypes.CDLL(in_pythonpath) else: print "can not find libBAMqc.so, you're not setup correctly, exiting\n" sys.exit() if sys.version_info[0] != 2 or sys.version_info[1] != 7: print >>sys.stderr, "\nYou are using python" + str(sys.version_info[0]) + '.' + str(sys.version_info[1]) + " ezBAMQC needs python2.7!\n" sys.exit() class pyResults : def __init__(self): self.filename = "" self.is_pairEnd = False self.clipping_plot_file = "" self.mapq_plot_file = "" self.mapq_file = "" self.read_cov_plot_file = "" self.trans_cov_plot_file = "" self.insert_plot_file = "" self.insert_file = "" self.read_dist_plot_file1 = "" self.read_dist_plot_file2 = "" self.read_dup_plot_file = "" self.readLen_plot_file = "" self.geneCount_file = "" self.seqDeDup_percent = 0 self.posDeDup_percent = 0 self.no_clipping = False self.no_rRNA = False self.total_reads = 0 self.uniq_mapped_reads = 0 self.multi_mapped_reads = 0 self.unmapped_reads = 0 self.low_qual = 0 self.low_qual_read1 = 0 self.low_qual_read2 = 0 self.pcr_dup = 0 self.unmapped_read1 = 0 self.unmapped_read2 = 0 self.mapped_read1 = 0 self.mapped_read2 = 0 self.forward_read = 0 self.reverse_read = 0 self.paired_reads = 0 self.mapped_plus_minus = 0 self.mapped_plus_plus = 0 self.mapped_minus_plus = 0 self.mapped_minus_minus = 0 self.ins_read = 0 self.del_read = 0 self.noSplice = 0 self.splice = 0 self.paired_diff_chrom = 0 self.rRNA_read = 0 self.intron_read = 0 self.cds_exon_read = 0 self.utr_5_read = 0 self.utr_3_read = 0 self.intergenic_up1kb_read = 0 self.intergenic_down1kb_read = 0 self.intergenic_read = 0 def read_in_res(cur_data_dir,label): res = pyResults() fname = cur_data_dir+label+'.res.txt' try : if os.path.exists(fname) : f = open(fname,'r') for line in f : line = line.strip() (name,value) = line.split('\t') if name == "rRNA_read" : res.rRNA_read = int(value) if name == "low_qual_read1" : res.low_qual_read1 = int(value) if name == "low_qual_read2" : res.low_qual_read2 = int(value) if name == "filename" : res.filename = value if name == "is_pairEnd" : res.is_pairEnd = True if int(value) == 1 else False if name == "clipping_plot_file" : res.clipping_plot_file = value if name == "mapq_plot_file" : res.mapq_plot_file = value if name == "mapq_file" : res.mapq_file = value if name == "read_cov_plot_file": res.read_cov_plot_file = value if name == "trans_cov_plot_file": res.trans_cov_plot_file = value if name == "insert_plot_file" : res.insert_plot_file = value if name == "insert_file" : res.insert_file = value if name == "read_dist_plot_file1" : res.read_dist_plot_file1 = value if name == "read_dist_plot_file2" : res.read_dist_plot_file2 = value if name == "read_dup_plot_file" : res.read_dup_plot_file = value if name == "readLen_plot_file" : res.readLen_plot_file = value if name == "geneCount_file" : res.geneCount_file = value if name == "seqDeDup_percent" : res.seqDeDup_percent = float(value) if name == "posDeDup_percent" : res.posDeDup_percent = float(value) if name == "no_clipping" : res.no_clipping = False if int(value) == 0 else True if name == "no_rRNA" : res.no_rRNA = False if int(value) == 0 else True if name == "total_reads" : res.total_reads = int(value) if name == "uniq_mapped_reads" : res.uniq_mapped_reads = int(value) if name == "multi_mapped_reads" : res.multi_mapped_reads = int(value) if name == "unmapped_reads" : res.unmapped_reads = int(value) if name =="low_qual" : res.low_qual = int(value) if name == "pcr_dup" : res.pcr_dup = int(value) if name == "unmapped_read1" : res.unmapped_read1 = int(value) if name == "unmapped_read2" : res.unmapped_read2 = int(value) if name =="mapped_read1": res.mapped_read1 = int(value) if name =="mapped_read2": res.mapped_read2 = int(value) if name =="forward_read": res.forward_read = int(value) if name =="reverse_read" : res.reverse_read = int(value) if name =="paired_reads": res.paired_reads = int(value) if name =="mapped_plus_minus": res.mapped_plus_minus = int(value) if name =="mapped_plus_plus": res.mapped_plus_plus = int(value) if name == "mapped_minus_plus" : res.mapped_minus_plus = int(value) if name =="mapped_minus_minus" : res.mapped_minus_minus = int(value) if name =="ins_read" : res.ins_read = int(value) if name == "del_read" : res.del_read = int(value) if name == "noSplice" : res.noSplice = int(value) if name =="splice": res.splice = int(value) if name =="paired_diff_chrom" : res.paired_diff_chrom = int(value) f.close() #os.remove(fname) else : sys.stderr.write("output file does not exist for sample %s\n" % (label)) except : sys.stderr.write("Error in reading output.\n") sys.exit(1) return res def worker(in_queue,out_queue) : #for (label,fname,param) in iter(in_queue.get,'STOP'): (fname,label,cur_data_dir,cur_fig_dir,rRNA_model,ref_gene_model,attrID,mapq,stranded) = in_queue.get() if fname is not None : ret = so.run_qc(cur_data_dir,cur_fig_dir,ref_gene_model,attrID,fname,rRNA_model,label,mapq,stranded) if ret == 1 : res = read_in_res(cur_data_dir,label) create_per_sample_plot(fname,label,cur_data_dir,cur_fig_dir,res) out_queue.put((label,res)) def distr_jobs2(args,cur_data_dir,cur_fig_dir): smp_res = dict() try: #mgr = multiprocessing.Manager() #param = [cur_data_dir,cur_fig_dir,args.mapq,args.stranded] if args.numProc <= len(args.ifiles) : num_process = args.numProc else : num_process = len(args.ifiles) processed = 0 while processed < len(args.ifiles) : task_queue = multiprocessing.Queue() result_queue = multiprocessing.Queue() if processed + num_process > len(args.ifiles) : num_process = len(args.ifiles) - processed for i in range(processed,processed+num_process) : label = args.labels[i] fname = args.ifiles[i] t = ((fname,label,cur_data_dir,cur_fig_dir,args.rRNA_model,args.ref_gene_model,args.attrID,args.mapq,args.stranded)) task_queue.put(t) procs = [] for i in range(num_process) : p = multiprocessing.Process(target=worker,args=(task_queue,result_queue)) #p.daemon=True p.start() procs.append(p) #task_queue.put('STOP') finished = 0 for p in procs : p.join() #sys.stderr.write(str(p.exitcode)+"\n") if p.exitcode != 0 : sys.stderr.write("subprocess error %s " % (p.exitcode)) sys.exit(1) if p.exitcode == 0 : finished += 1 processed += 1 if finished == len(procs) : break for i in range(num_process) : (label,res) = result_queue.get() smp_res[label] = res except: sys.stderr.write("Error: %s\n" % str(sys.exc_info()[1])) sys.stderr.write( "[Exception type: %s, raised in %s:%d]\n" % ( sys.exc_info()[1].__class__.__name__, os.path.basename(traceback.extract_tb( sys.exc_info()[2] )[-1][0]), traceback.extract_tb( sys.exc_info()[2] )[-1][1] ) ) sys.exit(1) return smp_res def main(): #read in options args = read_opts(prepare_parser()) info = args.info warn = args.warn debug = args.debug error = args.error crit = args.critical #local_rRNAIdx = None #local_geneIdx = None info("*** Starting BAMqc run. ***\n") #list of qc results smp_res = dict() #working directory and output files cur_dir = os.path.abspath(args.dir) cur_base_dir = os.path.basename(args.dir) cur_fig_dir = cur_dir+"/figs/" cur_data_dir = cur_dir + "/data/" data_file = cur_dir+"summary_data.txt" html_file = cur_dir+"/ezBAMQC_output.html" #check folders and files try : if os.path.exists(cur_dir) : error("Folder already exists!\n") sys.exit(1) if not os.path.exists(cur_dir) : os.makedirs(cur_dir) if not os.path.exists(cur_fig_dir): os.makedirs(cur_fig_dir) if not os.path.exists(cur_data_dir): os.makedirs(cur_data_dir) except : error("Error in create output folder.\n") sys.exit(1) if args.numProc >=2 : smp_res = distr_jobs2(args,cur_data_dir,cur_fig_dir) else : for i in range(len(args.ifiles)) : ifile = args.ifiles[i] ret = so.run_qc(cur_data_dir,cur_fig_dir,args.ref_gene_model,args.attrID,ifile,args.rRNA_model,args.labels[i],args.mapq,args.stranded,args.numThreads) if ret == 1 : res = read_in_res(cur_data_dir,args.labels[i]) create_per_sample_plot(ifile,args.labels[i],cur_data_dir,cur_fig_dir,res) smp_res[args.labels[i]] = res #sample correlation smp_corr_plot_file = cur_fig_dir+"smp_corr.png" smp_repro_plot_file = cur_fig_dir+"smp_reproducibility.png" smp_var_plot_file = cur_fig_dir+"smp_var.png" smp_inner_plot_file = cur_fig_dir+"smp_inner_dist.png" smp_cov_plot_file = cur_fig_dir+"smp_cov.png" smp_quality_plot_file = cur_fig_dir+"smp_qual.png" #smp_dup_plot_file = cur_fig_dir+"smp_dup.png" #smp_summary_file = cur_data_dir +"smp_summary.txt" if len(smp_res) > 1 : smp_cnt = 0 header_corr = 'c(' header_insert = 'c(' header_mapq = 'c(' #header_dup = 'c(' #sys.stderr.write(','.join(smp_res.values()+"\n")) filenames_corr = 'c(' filenames_insert = 'c(' filenames_mapq = 'c(' #filenames_dup_seq = 'c(' #filenames_dup_pos = 'c(' pe_smp_cnt = 0 for k in range(len(args.labels)) : key = args.labels[k] if not smp_res[key].mapq_file == "" : filenames_mapq += '"' + cur_base_dir+ "/data/" + os.path.basename(smp_res[key].mapq_file) + '",' header_mapq += '"'+key+'",' if not smp_res[key].insert_file == "" and smp_res[key].is_pairEnd: pe_smp_cnt += 1 filenames_insert += '"' + cur_base_dir + "/data/"+os.path.basename(smp_res[key].insert_file) + '",' header_insert += '"'+key+'",' if not smp_res[key].geneCount_file == "" : smp_cnt += 1 filenames_corr += '"' + cur_base_dir + "/data/"+ os.path.basename(smp_res[key].geneCount_file) + '",' header_corr += '"'+key+'",' header_corr = header_corr[0:len(header_corr)-1] + ')' filenames_corr = filenames_corr[0:len(filenames_corr)-1] + ')' filenames_insert = filenames_insert[0:len(filenames_insert)-1] + ')' header_insert = header_insert[0:len(header_insert)-1] + ')' header_mapq = header_mapq[0:len(header_mapq)-1] + ')' filenames_mapq = filenames_mapq[0:len(filenames_mapq)-1] + ')' #header_dup = header_dup[0:len(header_dup)-1] + ')' #filenames_dup_seq = filenames_dup_seq[0:len(filenames_dup_seq)-1] + ')' #filenames_dup_pos = filenames_dup_pos[0:len(filenames_dup_pos)-1] + ')' if smp_cnt > 1 : info("*** Sample Correlation ***") try : #subprocess.call(cmd_str+" >"+smp_summary_file, shell=True) smp_corr_r = cur_data_dir+"smp_correlation.r" f = open(smp_corr_r,'w') f.write("library(corrplot)\n") f.write('srcfiles = '+filenames_corr+'\n') f.write('destfile = "'+smp_corr_plot_file+'"\n') f.write('f1 = read.delim(srcfiles[1],header=T)\n') f.write('MM=matrix(nrow=length(f1[,1]),ncol=length(srcfiles))\n') f.write('rownames(MM)=f1[,1]\n') f.write('MM[,1]=f1[,2]\n') f.write('for (i in 2:length(srcfiles)){ \n') f.write(' f = read.delim(srcfiles[i],header=T)\n') f.write(' MM[,i] = f[,2] }\n') f.write('colnames(MM)='+header_corr+'\n') f.write('libSize<-colSums(MM)\n') f.write('MM<-t(t(MM)*1000000/libSize)\n') f.write('ss<-rowSums(MM)\n') f.write('M1<-MM[ss>0,]\n') f.write('MM_s<-t(scale(t(M1)))\n') f.write("M.cor<-cor(MM_s,method='sp')\n") f.write("M.cor[is.na(M.cor)]<- 0\n") f.write("png(destfile,width=500,height=500,units='px')\n") f.write("corrplot(M.cor,is.corr=T,order='FPC',method='color',type='full',add=F,diag=T)\n") f.write("dev.state = dev.off()\n") f.write("nz_genes = length(M1[,1])\n") f.write('destfile = "'+smp_repro_plot_file+'"\n') f.write("if(nz_genes >0) { \n") f.write("png(destfile,width=500,height=500,units='px')\n") f.write("nz_gene_mm = rep(0,length(M1[1,]))\n") f.write("for(i in 1:length(M1[1,])) { \n") f.write("nz_gene_mm[i] = length(which(M1[,i]>0))/nz_genes * 100 } \n") f.write("bplt <- barplot(nz_gene_mm,beside=T,border='NA',space=1.5,ylim=c(0,100),ylab='Genes reproducibly detected (%)',col='blue',names.arg=colnames(MM),las=2)\n") f.write("text(y= nz_gene_mm+2, x= bplt, labels=paste(as.character(round(nz_gene_mm,digits=1)),'%',sep=''), xpd=TRUE)\n") f.write("dev.state = dev.off()}\n") f.write('destfile = "'+smp_var_plot_file+'"\n') f.write("png(destfile,width=500,height=500,units='px')\n") f.write("mad = rep(0,length(M1[,1]))\n") f.write("nz_gene_median = rep(0,length(M1[,1]))\n") f.write("for(i in 1:length(M1[,1])) { \n") f.write("nz_gene_median[i] = median(M1[i,]) \n") f.write("mad[i] = median(abs(M1[i,]-nz_gene_median[i])) } \n") f.write("mad2 = mad[nz_gene_median >0] \n") f.write("nz_gene_median2 = nz_gene_median[nz_gene_median>0] \n") f.write("mad_vs_median = mad2/nz_gene_median2 \n") f.write("nz_gene_median3 = log(nz_gene_median2, base=2)\n") f.write("dd<-data.frame(nz_gene_median3,mad_vs_median) \n") f.write("x = densCols(nz_gene_median3,mad_vs_median, colramp=colorRampPalette(c('black', 'white')))\n") f.write("dd$dens <- col2rgb(x)[1,] + 1L \n") f.write('cols <- colorRampPalette(c("#000099", "#00FEFF", "#45FE4F", "#FCFF00", "#FF9400", "#FF3100"))(256)\n') f.write('dd$col <- cols[dd$dens]\n') f.write('plot(mad_vs_median ~ nz_gene_median3,data=dd[order(dd$dens),], col=col, pch=20,xlab="Gene expression (median RPM log2)",ylab="Median absolute deviation/median")\n') f.write('dev.state = dev.off()\n') #f.write('destfile = "'+smp_corr_plot_file2+'"\n') #f.write("M.pc<-prcomp(t(MM_s))\n") #f.write("png(destfile,width=500,height=500,units='px')\n") #f.write("plot(M.pc)\n") #f.write("dev.state = dev.off()\n") f.write('destfile = "'+smp_cov_plot_file+'"\n') f.write("png(destfile,width=500,height=500,units='px')\n") f.write('xname=c("<0.5","0.5-10","10-100",">=100")\n') f.write('Fn_mm = matrix(0,nrow=length(xname),ncol=length(M1[1,]))\n') f.write('rownames(Fn_mm) = xname \n') f.write('colnames(Fn_mm) = ' + header_corr + ' \n') f.write('for(i in 1:length(M1[1,])) { \n') f.write('Fn_mm[1,i] = length(which(M1[,i]<0.5)) \n') f.write('Fn_mm[2,i] = length(which(M1[,i]>=0.5 & M1[,i]<10))\n') f.write('Fn_mm[3,i] = length(which(M1[,i]>=10 & M1[,i]<100))\n') f.write('Fn_mm[4,i] = length(which(M1[,i]>=100)) }\n') f.write('barplot(Fn_mm,main="Gene abundance (RPM)",xlab="Sample",ylab="Frequency",col=c("green","blue","red","yellow"),legend=xname,las=2)\n') f.write("dev.state = dev.off()\n") if pe_smp_cnt > 0 : f.write('srcfiles2 = '+filenames_insert+'\n') f.write('destfile2 = "'+smp_inner_plot_file+'"\n') f.write("png(destfile2,width=500,height=500,units='px')\n") f.write('f = read.delim(srcfiles2[1],header=T)\n') f.write('freq=rep(round((f[,1]+f[,2]+1)/2,0),times=f[,3])\n') f.write('smp ='+header_insert+'\n') f.write('boxplot(freq,outline=F,xlim=c(0,length(smp)+1),ylab="Inner distance (bp)",col="blue",border="black") \n') f.write('for (i in 2:length(srcfiles2)){ \n') f.write(' f = read.delim(srcfiles2[i],header=T)\n') f.write('freq=rep(round((f[,1]+f[,2]+1)/2,0),times=f[,3])\n') f.write('boxplot(freq,add=T,outline=F,at=i,col="blue",border="black") }\n') f.write('axis(1,at=seq(1,length(smp),by=1),labels=smp,las=2)\n') f.write("dev.state = dev.off()\n") f.write('destfile3 = "'+smp_quality_plot_file+'"\n') f.write('srcfiles3 = '+filenames_mapq+'\n') f.write("png(destfile3,width=500,height=500,units='px')\n") f.write('xname=c("<3","3-10","10-20","20-30",">=30")\n') f.write('Fn_mm = matrix(0,nrow=length(xname),ncol=length(srcfiles3))\n') f.write('rownames(Fn_mm) = xname \n') f.write('colnames(Fn_mm) = ' + header_mapq + ' \n') f.write('for(i in 1:length(srcfiles3)) { \n') f.write(' f = read.delim(srcfiles3[i],header=T)\n') f.write(' if(length(which(f[,1]<3)) >0){ Fn_mm[1,i] = sum(f[which(f[,1]<3),3])/f[1,2]} \n') f.write('if(length(which(f[,1]>=3 & f[,1]<10)) >0) {Fn_mm[2,i] = sum(f[which(f[,1]<10 & f[,1]>=3),3])/f[1,2]} \n') f.write('if(length(which(f[,1]>=10 & f[,1]<20)) >0) {Fn_mm[3,i] = sum(f[which(f[,1]<20 & f[,1]>=10),3])/f[1,2] }\n') f.write('if(length(which(f[,1]>=20 & f[,1]<30)) >0) {Fn_mm[4,i] = sum(f[which(f[,1]<30 & f[,1]>=20),3])/f[1,2]} \n') f.write('if(length(which(f[,1]>=30)) >0) {Fn_mm[5,i] = sum(f[which(f[,1]>=30),3])/f[1,2] }} \n') f.write('barplot(Fn_mm,xlab="Sample",main="Mapping Quality",ylim=c(0,1),ylab="Frequency",col=c("blue","green","yellow","orange","red"),legend=xname,las=2)\n') f.write("dev.state = dev.off()\n") #f.write('destfile3 = "'+smp_dup_plot_file+'"\n') #f.write('srcfiles3 = '+filenames_dup_pos+'\n') #f.write('srcfiles4 = '+filenames_dup_seq+'\n') #f.write('png(destfile3,width=500,height=500,units="px")\n') #f.write('M = matrix(0,nrow=2,ncol=length(srcfiles3))\n') #f.write('colnames(M) = ' + header_dup + ' \n') #f.write('for(i in 1:length(srcfiles3)) { \n') #f.write('f_pos = read.delim(srcfiles3[i],header=T) \n') #f.write('f_seq = read.delim(srcfiles4[i],header=T) \n') #f.write('total = sum(f_pos[,1]*f_pos[,2]) \n') #f.write('pos_dedup = round(sum(f_pos[,2])/total*100,2) \n') #f.write('seq_dedup = round(sum(f_seq[,2])/total*100,2) \n') #f.write('M[1,i] = pos_dedup \n') #f.write('M[2,i] = seq_dedup }\n') #f.write('rownames(M)<-c("Position","Sequence")\n') #f.write('barplot(M,ylim=c(0,100),beside=T,col=c("blue","red"),main="Duplication",xlab="Sample",ylab="Percentage after deduplication",legend=c("Mapping position","Sequence"),args.legend=list(bty="n"))\n') #f.write('dev.state = dev.off()\n') f.close() subprocess.call("Rscript " + smp_corr_r , shell=True) except: sys.stderr.write("Error in computing sample correlation.\n") smp_corr_plot_file = "" smp_corr_plot_file2 = "" pass info("*** Correlation completed ***\n") #outputToHTML(smp_res,args.labels,smp_corr_plot_file,smp_inner_plot_file, smp_quality_plot_file,smp_cov_plot_file,html_file) outputToHTML(smp_res,args.labels,html_file) info("*** BAM QC run completed. ***\n") def create_per_sample_plot(ifile,label,cur_data_dir,cur_fig_dir,res): output_prefix_data = cur_data_dir + label output_prefix_fig = cur_fig_dir + label try: subprocess.call("Rscript "+output_prefix_data+'.read_distr.r',shell=True) except : print("Error in plotting read distributions.\n") res.read_dist_plot_file1 = "" pass try: subprocess.call("Rscript "+output_prefix_data+'.read_distr_pie.r',shell=True) except : print("Error in plotting read distributions.\n") res.read_dist_plot_file2 = "" pass #res.filename = ifile if os.path.isfile(output_prefix_data+'.clipping_profile.r') : try: subprocess.call("Rscript " + output_prefix_data + '.clipping_profile.r',shell=True) # subprocess.call("rm -rf "+ output_prefix + '.clipping_profile.r',shell=True) except: print("Cannot generate png file form " + output_prefix_data + '.clipping_profile.r\n') res.clipping_plot_file = "" pass else : res.no_clipping = True if os.path.isfile(output_prefix_data+'.mapq_profile.r') : try: subprocess.call("Rscript " + output_prefix_data + '.mapq_profile.r',shell=True) except: print("Cannot generate png file form " + output_prefix_data + '.mapq_profile.r\n') res.mapq_plot_file = "" pass else : res.mapq_plot_file = "" if os.path.isfile(output_prefix_data+'.geneBodyCoverage_plot.r') : try: subprocess.call("Rscript " + output_prefix_data + '.geneBodyCoverage_plot.r',shell=True) except: print("Cannot generate png file from " + output_prefix_data + '.geneBodyCoverage_plot.r\n') res.read_cov_plot_file = "" pass try: subprocess.call("Rscript " + output_prefix_data + '.TransCoverage.r',shell=True) except: print("Cannot generate png file from " + output_prefix_data + '.TransCoverage.r\n') res.trans_cov_plot_file = "" pass try: subprocess.call("Rscript " + output_prefix_data + ".ReadLen_plot.r", shell=True) except: print("Cannot generate png file form " + output_prefix_data + '.ReadLen_plot.rn') pass if res.is_pairEnd and os.path.isfile(output_prefix_data+'.inner_distance_plot.r') : try: subprocess.call("Rscript " + output_prefix_data + '.inner_distance_plot.r',shell=True) except: print("Cannot generate png file form " + output_prefix_data + '.inner_distance_plot.r\n') res.insert_plot_file = "" pass else : res.insert_plot_file = "" #def outputToHTML(res_list,smps,corr_plot_file,smp_inner_plot_file,smp_quality_plot_file,smp_cov_plot_file,html_file): def outputToHTML(res_list,smps,html_file): #smps = res_list.keys() tohtml = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Strict//EN">\n' tohtml += '<html>\n' tohtml += '<head><title>ezBAMQC Report</title>\n' tohtml += '<style type="text/css">\n' #/*ul Styles*/ tohtml += 'html,body{margin:0;padding:0;height:100%;width:1500px}\n' tohtml += 'div#header{background-color:#F3F2ED;}\n' tohtml += 'div#header h1{text-align:center; height:80px;line-height:80px;margin:0;padding-left:10px;}\n' tohtml += 'div#container{text-align:left;height:100%;width:1500px}\n' tohtml += 'div#navigation{background:#F6F0E0;}\n' tohtml += 'div#navigation{float:left;width:200px;height:100%}\n' tohtml += '.menu-item ul { \n' tohtml += 'background: #F6F0E0; \n' tohtml += 'font-size: 13px; \n' tohtml += 'line-height: 30px; \n' tohtml += 'height: 0px; \n' tohtml += 'list-style-type: none;' tohtml += 'overflow: hidden; \n' tohtml += 'padding: 0px; }\n' tohtml += '.menu-item:hover ul { height: 220px; }\n ' #/* table * tohtml += 'table{ margin:0;padding:0;width:1300px;table-layout:fixed;text-align:left; }\n' tohtml += 'table > thead > tr.tableizer-firstrow > th { padding: 10px; background: lavenderblush;} \n' #/*border: 4px solid #fff;*/ /*text-overflow: ellipsis;*/ /*overflow: hidden;*/ tohtml += 'table > tbody > tr > td{ padding: 10px; background: #f8f8f8; word-wrap: break word; } \n' tohtml += 'div#footer{background:#BFBD93;}\n' tohtml += 'div#footer p{margin:0;padding:5px 10px}\n' tohtml += 'div#footer{clear:both;width:100%;text-align:center}\n' tohtml += 'div#main{float:right;width:1300px}\n' tohtml += 'a{text-decoration:none; color:#000000;}\n' tohtml += 'a:hover {text-decoration: underline; }\n' tohtml += '</style> </head>\n' tohtml += '<body>\n' tohtml += '<div id ="container">\n' tohtml += '<div id="header"><h1>ezBAMQC Report</h1><p text-align="left">Created On: '+datetime.now().strftime('%m-%d-%Y')+'</p></div>\n' i = 0 tohtml += '<div id="wrapper">\n' tohtml += '<div class="summary" id="navigation">\n' #tohtml += '<div id="update_time">\n' #tohtml += #tohtml += '</div>\n' tohtml += '<h2>Summary</h2>\n' tohtml += '<ul>\n' #i = 0 for i in range(len(smps)) : key = smps[i] tohtml += '<li>' tohtml += '<div class="menu-item">\n' tohtml += '<h4>'+key+'</h4>\n' tohtml += '<ul>\n' tohtml += '<li><a href="#M'+str(i)+'0">Basic Statistics</a></li>\n' tohtml += '<li><a href="#M'+str(i)+'1">Read Distribution</a></li>\n' tohtml += '<li><a href="#M'+str(i)+'2">Mappability</a></li>\n' tohtml += '<li><a href="#M'+str(i)+'3">Coverage</a></li>\n' tohtml += '<li><a href="#M'+str(i)+'4">Read Length and Insertion Size</a></li>\n' #tohtml += '<li><a href="#M'+str(i)+'6">rRNA contamination</a></li>\n' tohtml += '</ul></div></li>\n' #i += 1 smp_corr_pos = len(smps)*5 + 100 if len(res_list) > 1: tohtml += '<li><div class="menu-item"><a href="#M'+str(smp_corr_pos)+'"><h4>Sample Correlation</h4></a></div></li>\n' tohtml += '</ul>\n' tohtml += '</div>\n' tohtml += '<div id="main" >\n' for i in range(len(smps)): key = smps[i] res = res_list[key] tohtml += '<h2>'+key+'</h2>\n' tohtml += '<div class="module"><h2 id="M'+str(i)+'0">Basic Statistics</h2>\n' tohtml += '<table>\n' tohtml += '<thead><tr class=\"tableizer-firstrow\">\n'; tohtml += '<th>Measure</th><th>Value</th></tr></thead>\n' tohtml += '<tbody><tr><td>Total Reads</td><td>'+str(res.total_reads)+'</td></tr>\n' if not res.is_pairEnd : tohtml += '<tr><td>Unique Reads</td><td>'+str(res.uniq_mapped_reads)+'</td></tr>\n' tohtml += '<tr><td>Multi-reads</td><td>'+str(res.multi_mapped_reads)+'</td></tr>\n' tohtml += '<tr><td>Unmapped Reads</td><td>'+str(res.unmapped_reads)+'</td></tr>\n' tohtml += '<tr><td>Low Quality Reads</td><td>'+str(res.low_qual)+'</td></tr>\n' tohtml += '<tr><td>Forward Reads</td><td>'+str(res.forward_read)+'</td></tr>\n' tohtml += '<tr><td>Reverse Reads</td><td>'+str(res.reverse_read)+'</td></tr>\n' tohtml += '<tr><td>Splice Reads</td><td>'+str(res.splice)+'</td></tr>\n' tohtml += '<tr><td>Non-Splice Reads</td><td>'+str(res.noSplice)+'</td></tr>\n' tohtml += '<tr><td>rRNA Reads</td><td>'+str(res.rRNA_read)+'</td></tr></tbody></table></div>\n' else : # paired read tohtml += '<tr><td>Uniquely Mapped Pairs</td><td>'+str(res.paired_reads)+'</td></tr>\n' tohtml += '<tr><td>Uniquely Mapped Read1</td><td>'+str(res.mapped_read1)+'</td></tr>\n' tohtml += '<tr><td>Uiquely Mapped Read2</td><td>'+str(res.mapped_read2)+'</td></tr>\n' tohtml += '<tr><td>Multi-reads</td><td>'+str(res.multi_mapped_reads)+'</td></tr>\n' tohtml += '<tr><td>Unmapped Read1</td><td>'+str(res.unmapped_read1)+'</td></tr>\n' tohtml += '<tr><td>Unmapped Read2</td><td>'+str(res.unmapped_read2)+'</td></tr>\n' tohtml += '<tr><td>Number of read mapped "+,-" </td><td>'+str(res.mapped_plus_minus)+'</td></tr>\n' tohtml += '<tr><td>Number of read mapped "+,+" </td><td>'+str(res.mapped_plus_plus)+'</td></tr>\n' tohtml += '<tr><td>Number of read mapped "-,+" </td><td>'+str(res.mapped_minus_plus)+'</td></tr>\n' tohtml += '<tr><td>Number of read mapped "-,-" </td><td>'+str(res.mapped_minus_minus)+'</td></tr>\n' tohtml += '<tr><td>Low Quality Read1</td><td>'+str(res.low_qual_read1)+'</td></tr>\n' tohtml += '<tr><td>Low Quality Read2</td><td>'+str(res.low_qual_read2)+'</td></tr>\n' tohtml += '<tr><td>Forward Reads</td><td>'+str(res.forward_read)+'</td></tr>\n' tohtml += '<tr><td>Reverse Reads</td><td>'+str(res.reverse_read)+'</td></tr>\n' tohtml += '<tr><td>Splice Reads</td><td>'+str(res.splice)+'</td></tr>\n' tohtml += '<tr><td>Non-Splice Reads</td><td>'+str(res.noSplice)+'</td></tr>\n' tohtml += '<tr><td>Pairs mapped to different chromosomes</td><td>'+str(res.paired_diff_chrom)+'</td></tr>\n' tohtml += '<tr><td>rRNA Reads</td><td>'+str(res.rRNA_read)+'</td></tr></tbody></table></div>\n' tohtml += '<div class="module"><h2 id="M'+str(i)+'1">Read Distribution</h2>\n' tohtml += '<p><img class="indented" src="./figs/'+os.path.basename(res.read_dist_plot_file1)+'" alt="Read Distribution"><img class="indented" src="./figs/'+os.path.basename(res.read_dist_plot_file2)+'" alt="Read Distribution"></p></div>\n' if res.no_clipping : tohtml += '<div class="module"><h2 id="M'+str(i)+'2">Mappability Profile</h2>\n' tohtml += '<p>There is no soft clipping. <img class="indented" src="./figs/'+os.path.basename(res.mapq_plot_file)+'" alt="MapQ Profile"> </p></div>\n' else : tohtml += '<div class="module"><h2 id="M'+str(i)+'2">Mappability</h2>\n' tohtml += '<p><img class="indented" src="./figs/'+os.path.basename(res.clipping_plot_file)+'" alt="Mappablity Profile"> <img class="indented" src="./figs/'+os.path.basename(res.mapq_plot_file)+'" alt="MapQ Profile"></p></div>\n' tohtml += '<div class="module"><h2 id="M'+str(i)+'3">Coverage</h2>\n' tohtml += '<p><img class="indented" src="./figs/'+os.path.basename(res.read_cov_plot_file)+'" alt="Read Coverage"> <img class="indented" src="./figs/'+os.path.basename(res.trans_cov_plot_file)+'" alt="Read Coverage"></p></div>\n' if res.is_pairEnd : tohtml += '<div class="module"><h2 id="M'+str(i)+'4">Read Length and Insertion Size</h2>\n' tohtml += '<p><img class="indented" src="./figs/'+os.path.basename(res.readLen_plot_file)+'" alt="Read Length"> <img class="indented" src="./figs/'+os.path.basename(res.insert_plot_file)+'" alt="Insertion Size"></p></div>\n' else : tohtml += '<div class="module"><h2 id="M'+str(i)+'4">Read Length</h2>\n' tohtml += '<p><img class="indented" src="./figs/'+os.path.basename(res.readLen_plot_file)+'" alt="Read Length"></p></div>\n' if len(res_list) >1 : tohtml += '<div class="Smp_corr"><h2 id="M'+str(smp_corr_pos)+'">Sample Correlation and Quality</h2>\n' tohtml += '<p><img class="indented" src="./figs/smp_corr.png" alt="Sample Correlation"><img class="indented" src="./figs/smp_qual.png" alt="Sample Correlation"></p>\n' if res.is_pairEnd : tohtml += '<h2 id="M'+str(smp_corr_pos)+'">Sample Coverage and Insert size</h2>\n' tohtml += '<p><img class="indented" src="./figs/smp_cov.png" alt="Sample Coverage"><img class="indented" src="./figs/smp_inner.png" alt="Sample insert size"></p></div>\n' else : tohtml += '<h2 id="M'+str(smp_corr_pos)+'">Sample Coverage</h2>\n' tohtml += '<p><img class="indented" src="./figs/smp_cov.png" alt="Sample Coverage"></p></div>\n' tohtml += '<h2 id="M'+str(smp_corr_pos)+'">Sample Variation</h2>\n' tohtml += '<p><img class="indented" src="./figs/smp_reproducibility.png" alt="Sample Variation"><img class="indented" src="./figs/smp_var.png" alt="Sample Variation"></p></div>\n' tohtml += '</div>\n' tohtml += '<div id="footer"><p>Produced by Bioinformatics Shared Resource at CSHL </p></div></div></div></body></html>\n' try : f = open(html_file,'w') f.write(tohtml+"\n") f.close() except : sys.stderr.write("Cannot generate the final report. \n") sys.exit(1) def prepare_parser (): """ inputs(parameters) required/allowed in this pipeline """ desc = "Quality control of mapped NGS data (BAM/SAM files) ." exmp = "Example: ezBAMQC -r mm9_refGene.gtf -i treat1.bam treat2.bam treat3.bam -q 30 --rRNA mm9_rRNA.bed -o bamqc_out" parser = argparse.ArgumentParser(description = desc,epilog = exmp) parser.add_argument('-i', '--inputFile', metavar = 'alignment_files', dest = 'ifiles', nargs = '+', required = True, help = 'Alignment files. Could be multiple SAM/BAM files separated by space. Required.') parser.add_argument('-r', '--refgene', metavar = 'refgene', dest='ref_gene_model', nargs = '?', type=str, required = True,help = 'refGene GTF file. Required') parser.add_argument('-f', metavar='attrID', dest='attrID', nargs='?', type=str, default="gene_id", help='The read summation at which feature level in the GTF file. DEFAULT: gene_id.') parser.add_argument('--rRNA', metavar = 'rRNA', dest='rRNA_model', nargs = '?', type=str, default="", help = 'rRNA BED file.') parser.add_argument('-o', '--outputDir', metavar = 'dir', dest='dir', nargs = '?', type=str, required = True, help = 'output directory. Required.') # parser.add_argument('-i', '--index', metavar = 'transript_Index', dest='trIdx', nargs = '?', # help = 'Transcriptome index file.') parser.add_argument('--stranded', metavar='stranded', dest='stranded', nargs='?', type=str, default="yes", choices=['yes','no','reverse'], help='Is this a stranded library? (yes, no, or reverse). DEFAULT: yes.') parser.add_argument('-q', '--mapq', metavar = 'mapq', dest='mapq', nargs = '?', default=30, type=int, help = 'Minimum mapping quality (phred scaled) for an alignment to be called uniquely mapped. DEFAULT:30') #parser.add_argument('-l', '--lowBound', metavar = 'lb', dest='lb', nargs = '?', default=-250, type=int, # help = 'Lower bound for plotting insert size distribution. DEFAULT:-250') #parser.add_argument('-u', '--upperBound', metavar = 'ub', dest='ub', nargs = '?', default=250, type=int, # help = 'Upper bound for plotting insert size distribution. DEFAULT:250') #parser.add_argument('-s', '--stepSize', metavar = 'stepsize', dest='step_size', nargs = '?', default=5, type=int, # help = 'Step size for plotting insert size distribution. DEFAULT:5') parser.add_argument('-l','--label',metavar = 'labels', dest = 'labels', nargs = '+', help = 'Labels of input files. DEFAULT:smp1 smp2 ...') #parser.add_argument('-p', '--processes', dest='numProc', default=1, type=int,help='Number of processes to use .DEFAULT:1') parser.add_argument('-t', '--threads', dest='numThreads', default=1, type=int,help='Number of threads to use .DEFAULT:1') return parser def read_opts(parser): ''' object parser contains parsed options ''' args = parser.parse_args() args.numProc = 1 # logging object logging.basicConfig(level=20, format='%(levelname)-5s @ %(asctime)s: %(message)s ', datefmt='%a, %d %b %Y %H:%M:%S', stream=sys.stderr, filemode="w" ) #treatment files if args.labels is not None : if len(args.labels) >0 and len(args.ifiles) != len(args.labels) : logging.error("Number of labels does not match with the number of samples.\n") sys.exit(1) if args.labels is None : args.labels = [] for i in range(len(args.ifiles)) : if not os.path.isfile(args.ifiles[i]) : logging.error("No such file: %s !\n" % (args.ifiles[i])) sys.exit(1) if len(args.labels) < len(args.ifiles) : args.labels.append("smp"+str(i)) # if args.trIdx is None : # logging.warning("Trancsriptome index file is not available.\n") # # else : # if not os.path.isfile(args.trIdx) : # logging.error("No such file : %s !\n" %(args.trIdx)) # sys.exit(1) if args.stranded not in ['yes', 'no', 'reverse'] : logging.error("Does not support such stranded value: %s !\n" % (args.stranded)) sys.exit(1) if args.mapq is None : args.mapq = 30 if args.rRNA_model is not None and args.rRNA_model != "": if not os.path.isfile(args.rRNA_model) : logging.error("No such file : %s \n" %(args.rRNA_model)) sys.exit(1) if args.rRNA_model is None : args.rRNA_model = "" if args.attrID is None or args.attrID == "": logging.error("please specify the read summation at which feature level in the GTF file\n") sys.exit(1) if args.ref_gene_model is None : logging.error("reference gene model is required.\n") sys.exit(1) else : if not os.path.isfile(args.ref_gene_model) : logging.error("No such file : %s !\n" %(args.ref_gene_model)) sys.exit(1) # logging alias args.critical = logging.critical args.error = logging.error args.warn = logging.warning args.debug = logging.debug args.info = logging.info return args if __name__ == '__main__': try: start_time = time.time() main() end_time = time.time() sys.stderr.write("Elapsed time was " + str(round((end_time - start_time) / 60, 2)) + " minutes.\n") except KeyboardInterrupt: sys.stderr.write("User interrupt !\n") sys.exit(0)