# HG changeset patch # User xuebing # Date 1332270900 14400 # Node ID a861f40db890a6ca558e8407aa54c6c874045386 # Parent 1a177d17b27df9fb67151377329d156847b916f2 Uploaded diff -r 1a177d17b27d -r a861f40db890 alignr.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignr.py Tue Mar 20 15:15:00 2012 -0400 @@ -0,0 +1,353 @@ +''' +the scripts takes two files as input, and compute the coverage of +features in input 1 across features in input 2. Features in input 2 are +divided into bins and coverage is computed for each bin. + +please check the help information by typing: + + python align.py -h + + +requirement: + please install the following tools first: + bedtools: for read/region overlapping, http://code.google.com/p/bedtools/ + +''' + +import os,sys,os.path +from optparse import OptionParser + +def lineCount(filename): + with open(filename) as f: + for i, l in enumerate(f): + pass + return i + 1 + +def combineFilename(f1,f2): + ''' + fuse two file names into one + ''' + return f1.split('/')[-1]+'-'+f2.split('/')[-1] + +def checkFormat(filename1,filename2,input1format): + ''' + check the format of input files + ''' + + # file1 + # read the first line, see how many filds + ncol1 = 6 + if input1format == "BED": + f = open(filename1) + line = f.readline().strip().split('\t') + ncol1 = len(line) + if ncol1 < 3: + print "ERROR: "+filename1+" has only "+str(ncol1)+" columns (>=3 required). Make sure it has NO header line and is TAB-delimited." + sys.exit(1) + f.close() + + # file2 + f = open(filename2) + line = f.readline().strip().split('\t') + ncol2 = len(line) + if ncol2 < 3: + print "ERROR: "+filename2+" has only "+str(ncol2)+" columns (>=3 required). Make sure it has NO header line and is TAB-delimited." + sys.exit(1) + + return ncol1,ncol2 + + +def makeBed(filename,ncol): + ''' + add up to 6 column + ''' + f = open(filename) + outfile = filename+'.tmp.bed' + outf = open(outfile,'w') + if ncol == 3: + for line in f: + outf.write(line.strip()+'\t.\t0\t+\n') + elif ncol == 4: + for line in f: + outf.write(line.strip()+'\t0\t+\n') + if ncol == 5: + for line in f: + outf.write(line.strip()+'\t+\n') + f.close() + outf.close() + return outfile + +def makeWindow(filename,window): + + outfile = filename+'-window='+str(window)+'.tmp.bed' + if not os.path.exists(outfile): + f=open(filename) + out = open(outfile,'w') + lines = f.readlines() + if 'track' in lines[0]: + del lines[0] + for line in lines: + flds = line.strip().split('\t') + + #new position + center = (int(flds[1]) + int(flds[2]))/2 + start = center - window + end = center + window + if start >= 0: + flds[1] = str(start) + flds[2] = str(end) + out.write('\t'.join(flds)+'\n') + f.close() + out.close() + return outfile + +def groupReadsMapped2aRegion(filename,ncol): + ''' + read output from intersectBED + find all reads mapped to each region + ''' + try: + f=open(filename) + #If filename cannot be opened, print an error message and exit + except IOError: + print "could not open",filename,"Are you sure this file exists?" + sys.exit(1) + lines = f.readlines() + + allReadsStart = {} + allReadsEnd = {} + regionStrand = {} + regionStart = {} + regionEnd = {} + + for line in lines: + flds = line.strip().split('\t') + key = '_'.join(flds[ncol:(ncol+4)]) + if not allReadsStart.has_key(key): + allReadsStart[key] = list() + allReadsEnd[key] = list() + #print flds[ncol+0],flds[ncol+1],flds[ncol+2] + allReadsStart[key].append(int(flds[1])) + allReadsEnd[key].append(int(flds[2])) + regionStrand[key] = flds[ncol+5] + regionStart[key] = int(flds[ncol+1]) + regionEnd[key] = int(flds[ncol+2]) + return (allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd) + + +def createRegionProfile(allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd,nbins): + ''' + each region is divided into nbins + compute the number of reads covering each bin for each region + ''' + RegionProfile = {} + nRead = {} # num of all reads in the region + for region in allReadsStart.keys(): + RegionProfile[region] = [0]*nbins + nRead[region] = len(allReadsStart[region]) + #print region,nRead[region],allReadsStart[region] + for i in range(nRead[region]): + RegionProfile[region] = updateRegionCount(RegionProfile[region],allReadsStart[region][i],allReadsEnd[region][i],regionStart[region],regionEnd[region],regionStrand[region],nbins) + return RegionProfile,nRead + +def updateRegionCount(RegionCount,readStart,readEnd,regionStart,regionEnd,strand,nbins): + ''' + each region is divided into nbins, + add 1 to each bin covered by the read + ''' + L = regionEnd-regionStart + start = int(nbins*(readStart-regionStart)/L) + end = int(nbins*(readEnd-regionStart)/L) + if start < 0: + start = 0 + if end > nbins: + end = nbins + if strand == '-': + for i in range(start,end): + RegionCount[nbins-1-i] = RegionCount[nbins-1-i] + 1 + else: # if the 6th column of the input is not strand, will treat as + strand by default + for i in range(start,end): + RegionCount[i] = RegionCount[i] + 1 + return RegionCount + +def saveProfile(filename,Profile,nRegion): + out = open(filename,'w') + for regionType in Profile.keys(): + #print Profile[regionType] + out.write(regionType+'\t'+str(nRegion[regionType])+'\t'+'\t'.join(map(str,Profile[regionType]))+'\n') + +def saveSummary(filename,Profile,nbin): + out = open(filename+'.summary','w') + + nfeat = len(Profile) + summaryprofile = [0]*nbin + for regionType in Profile.keys(): + for i in range(nbin): + summaryprofile[i] += Profile[regionType][i] + out.write(filename+'\t'+str(nfeat)+'\t'+'\t'.join(map(str,summaryprofile))+'\n') + out.close() + # calculate standard error + out = open(filename+'.standarderror','w') + sd = [0.0]*nbin + u = [0.0]*nbin + for i in range(nbin): + u[i] = float(summaryprofile[i])/nfeat + for regionType in Profile.keys(): + sd[i] = sd[i] + (Profile[regionType][i] - u[i])**2 + sd[i] = sd[i]**0.5 / nfeat + out.write(filename+'\t'+str(nfeat)+'\t'+'\t'.join(map(str,sd))+'\n') + out.close() + +def main(): + usage = "usage: %prog [options] -a inputA -b inputB" + parser = OptionParser(usage) + parser.add_option("-a", dest="inputA", + help="(required) input file A, interval (first 3 columns are chrN, start and end) or BAM format. The script computes the depth of coverage of features in file A across the features in file B" ) + parser.add_option("-b",dest="inputB", + help="(required) input file B, interval file" ) + parser.add_option("-f",dest="aformat",default="BED", + help="Format of input file A. Can be BED (default) or BAM") + parser.add_option("-w",type='int',dest="window", + help="Generate new inputB by making a window of 2 x WINDOW bp (in total) flanking the center of each input feature" ) + parser.add_option("-n", type="int", dest="nbins",default=100, + help="number of bins. Features in B are binned, and the coverage is computed for each bin. Default is 100") + parser.add_option("-s",action="store_true", dest="strandness", + help="enforce strandness: require overlapping on the same strand. Default is off") + parser.add_option("-p",action="store_true", dest="plot",default=False, + help="load existed intersectBed outputfile") + parser.add_option("-q", action="store_true", dest="quiet",default=False, + help="suppress output on screen") + parser.add_option("-o", dest="output_data", + help="(optional) output coverage file (txt) name." ) + parser.add_option("-v", dest="output_plot", + help="(optional) output plot (pdf) file name." ) + parser.add_option("-l", dest="plot_title", default="", + help="(optional) output title of the plot." ) + parser.add_option("--ylim", dest="ylim", default="min,max", + help="(optional) ylim of the plot" ) + parser.add_option("--summary-only", action="store_true", dest="summary_only",default=False, + help="save profile summary only (no data for individual features)") + parser.add_option("--compute-se", action="store_true", dest="compute_se",default=False, + help="compute and plot standard deviation for each bin. used when --summary-only is on") + parser.add_option("--profile-only", action="store_true", dest="profile_only",default=False, + help="save profile only (no plot)") + parser.add_option("--span", type="float", dest="span",default=0.1, + help="loess span smooth parameter, 0.1 ~ 1") + + (options, args) = parser.parse_args() + + if options.inputA == None or options.inputB == None: + parser.error("Please specify two input files!!") + + if not options.quiet: + print "checking input file format..." + + ncol,ncol2 = checkFormat(options.inputA ,options.inputB,options.aformat) + + if ncol2 < 6: + options.inputB = makeBed(options.inputB,ncol2) + if not options.quiet: + print "fill up 6 columns" + + if options.window > 0: + if not options.quiet: + print "making windows from "+options.inputB+"..." + options.inputB = makeWindow(options.inputB,options.window) + + output = combineFilename(str(options.inputA),str(options.inputB)) + + if not options.plot: + if options.aformat == "BAM": + cmd = "intersectBed -abam "+str(options.inputA)+" -b "+str(options.inputB) + ' -bed -split ' + else: + cmd = "intersectBed -a "+str(options.inputA)+" -b "+str(options.inputB) + if options.strandness: + cmd = cmd + ' -s' + cmd = cmd +" -wo > "+ output+'-intersect.tmp.bed' + if not options.quiet: + print "search for overlappings: "+cmd + status = os.system(cmd) + if status != 0: + sys.exit(1) + + + if not options.quiet: + print 'group reads mapped to the same region...' + + allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd = groupReadsMapped2aRegion(output+'-intersect.tmp.bed',ncol) + + if len(allReadsStart) == 0: + if not options.quiet: + print 'no overlap found!!' + os.system('rm *tmp.*') + sys.exit(1) + + if not options.quiet: + print 'count number of reads mapped to each bin...' + + RegionProfile,nRead = createRegionProfile(allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd,options.nbins) + + if options.output_data == None: + options.output_data = output+'.txt' + + if options.summary_only: + saveSummary(options.output_data,RegionProfile,options.nbins) + + else: + saveProfile(options.output_data,RegionProfile,nRead) + + if not options.quiet: + print 'results saved to: '+ options.output_data + + if not (options.summary_only or options.profile_only ): + # visualize + + if options.window < 1: + xlab = 'relative position (bins)' + else: + xlab = 'relative position (bp)' + + if options.output_plot == None: + options.output_plot = output+'.pdf' + + title = options.plot_title+'\n n = '+str(len(RegionProfile)) + + rscript = open("tmp.r","w") + rscript.write("x <- read.table('"+options.output_data+"')\n") + rscript.write("pdf('"+options.output_plot+"')\n") + rscript.write("avg <- colSums(x[,3:ncol(x)])/nrow(x)\n") + rscript.write("err <- sd(x[,3:ncol(x)])/sqrt(nrow(x))\n") + + if options.window == 0: + rscript.write("xticks <- seq("+str(options.nbins)+")\n") + else: + rscript.write("xticks <- seq("+str(-options.window)+","+str(options.window)+",length.out="+str(options.nbins)+")\n") + + if options.ylim != 'min,max': + rscript.write("ylim=c("+options.ylim+")\n") + else: + rscript.write("ylim=c(min(avg-err),max(avg+err))\n") + rscript.write("par(cex=1.5)\n") + #smooth + if options.span >= 0.1: + rscript.write("avg = loess(avg~xticks,span="+str(options.span)+")$fitted\n") + rscript.write("err = loess(err~xticks,span="+str(options.span)+")$fitted\n") + rscript.write("plot(xticks,avg,ylab='average coverage',main='"+title+"',xlab='"+xlab+"',type='l',lwd=0,ylim=ylim)\n") + rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col='slateblue1',border=NA)\n") + rscript.write("lines(xticks,avg,type='l',lwd=1)\n") + #rscript.write("xticks <- barplot(avg,names.arg=seq("+str(options.nbins)+"),ylab='average coverage',main='"+title+"',xlab='"+xlab+"',,ylim=c(min(avg-err),max(avg+err)))\n") + #rscript.write("arrows(xticks,avg+err, xticks, avg-err, angle=90, code=3, length=0.0,col='green')\n") + #rscript.write("lines(xticks,avg,lwd=2)\n") + #rscript.write("lines(xticks,avg-err,col='green')\n") + #rscript.write("lines(xticks,avg+err,col='green')\n") + rscript.write("dev.off()\n") + rscript.close() + + os.system("R --vanilla < tmp.r") + + # remove intermediate output + os.system('rm *tmp.*') + + +if __name__ == "__main__": + main()