| Previous changeset 1:1a177d17b27d (2012-03-20) Next changeset 3:b11a21c704ec (2012-03-20) |
|
Commit message:
Uploaded |
|
added:
alignr.py |
| b |
| diff -r 1a177d17b27d -r a861f40db890 alignr.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignr.py Tue Mar 20 15:15:00 2012 -0400 |
| [ |
| b'@@ -0,0 +1,353 @@\n+\'\'\'\n+the scripts takes two files as input, and compute the coverage of \n+features in input 1 across features in input 2. Features in input 2 are \n+divided into bins and coverage is computed for each bin. \n+\n+please check the help information by typing:\n+\n+ python align.py -h\n+\n+\n+requirement:\n+ please install the following tools first:\n+ bedtools: for read/region overlapping, http://code.google.com/p/bedtools/\n+ \n+\'\'\'\n+\n+import os,sys,os.path\n+from optparse import OptionParser\n+\n+def lineCount(filename):\n+ with open(filename) as f:\n+ for i, l in enumerate(f):\n+ pass\n+ return i + 1\n+\n+def combineFilename(f1,f2):\n+ \'\'\'\n+ fuse two file names into one\n+ \'\'\'\n+ return f1.split(\'/\')[-1]+\'-\'+f2.split(\'/\')[-1]\n+\n+def checkFormat(filename1,filename2,input1format):\n+ \'\'\'\n+ check the format of input files\n+ \'\'\'\n+\n+ # file1\n+ # read the first line, see how many filds\n+ ncol1 = 6\n+ if input1format == "BED":\n+ f = open(filename1)\n+ line = f.readline().strip().split(\'\\t\')\n+ ncol1 = len(line)\n+ if ncol1 < 3:\n+ print "ERROR: "+filename1+" has only "+str(ncol1)+" columns (>=3 required). Make sure it has NO header line and is TAB-delimited."\n+ sys.exit(1)\n+ f.close()\n+ \n+ # file2\n+ f = open(filename2)\n+ line = f.readline().strip().split(\'\\t\')\n+ ncol2 = len(line) \n+ if ncol2 < 3:\n+ print "ERROR: "+filename2+" has only "+str(ncol2)+" columns (>=3 required). Make sure it has NO header line and is TAB-delimited."\n+ sys.exit(1) \n+\n+ return ncol1,ncol2\n+\n+\n+def makeBed(filename,ncol):\n+ \'\'\'\n+ add up to 6 column\n+ \'\'\'\n+ f = open(filename)\n+ outfile = filename+\'.tmp.bed\'\n+ outf = open(outfile,\'w\')\n+ if ncol == 3:\n+ for line in f:\n+ outf.write(line.strip()+\'\\t.\\t0\\t+\\n\')\n+ elif ncol == 4:\n+ for line in f:\n+ outf.write(line.strip()+\'\\t0\\t+\\n\')\n+ if ncol == 5:\n+ for line in f:\n+ outf.write(line.strip()+\'\\t+\\n\')\n+ f.close()\n+ outf.close()\n+ return outfile\n+ \n+def makeWindow(filename,window):\n+\n+ outfile = filename+\'-window=\'+str(window)+\'.tmp.bed\'\n+ if not os.path.exists(outfile):\n+ f=open(filename)\n+ out = open(outfile,\'w\')\n+ lines = f.readlines()\n+ if \'track\' in lines[0]:\n+ del lines[0]\n+ for line in lines:\n+ flds = line.strip().split(\'\\t\')\n+\n+ #new position\n+ center = (int(flds[1]) + int(flds[2]))/2\n+ start = center - window\n+ end = center + window\n+ if start >= 0:\n+ flds[1] = str(start)\n+ flds[2] = str(end)\n+ out.write(\'\\t\'.join(flds)+\'\\n\')\n+ f.close()\n+ out.close()\n+ return outfile\n+\n+def groupReadsMapped2aRegion(filename,ncol):\n+ \'\'\'\n+ read output from intersectBED\n+ find all reads mapped to each region\n+ \'\'\'\n+ try:\n+ f=open(filename)\n+ #If filename cannot be opened, print an error message and exit\n+ except IOError:\n+ print "could not open",filename,"Are you sure this file exists?"\n+ sys.exit(1)\n+ lines = f.readlines()\n+ \n+ allReadsStart = {}\n+ allReadsEnd = {}\n+ regionStrand = {}\n+ regionStart = {}\n+ regionEnd = {}\n+ \n+ for line in lines:\n+ flds = line.strip().split(\'\\t\')\n+ key = \'_\'.join(flds[ncol:(ncol+4)])\n+ if not allReadsStart.has_key(key):\n+ allReadsStart[key] = list()\n+ allReadsEnd[key] = list()\n+ #print flds[ncol+0],flds[ncol+1],flds[ncol+2]\n+ allReadsStart[key].append(int(flds[1]))\n+ allReadsEnd[key].append(int(flds[2]))\n+ regionStrand[key] = flds[ncol+5] \n+ regionStart[key] = int(flds[ncol+1]) \n+ regionEnd[key] = int(flds[ncol+2]) \n+ return (allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd)\n+ \n+'..b'ions.inputB))\n+ \n+ if not options.plot:\n+ if options.aformat == "BAM":\n+ cmd = "intersectBed -abam "+str(options.inputA)+" -b "+str(options.inputB) + \' -bed -split \'\n+ else:\n+ cmd = "intersectBed -a "+str(options.inputA)+" -b "+str(options.inputB)\n+ if options.strandness:\n+ cmd = cmd + \' -s\'\n+ cmd = cmd +" -wo > "+ output+\'-intersect.tmp.bed\'\n+ if not options.quiet:\n+ print "search for overlappings: "+cmd\n+ status = os.system(cmd)\n+ if status != 0:\n+ sys.exit(1)\n+\n+ \n+ if not options.quiet:\n+ print \'group reads mapped to the same region...\'\n+ \n+ allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd = groupReadsMapped2aRegion(output+\'-intersect.tmp.bed\',ncol)\n+\n+ if len(allReadsStart) == 0:\n+ if not options.quiet:\n+ print \'no overlap found!!\'\n+ os.system(\'rm *tmp.*\')\n+ sys.exit(1)\n+ \n+ if not options.quiet:\n+ print \'count number of reads mapped to each bin...\'\n+ \n+ RegionProfile,nRead = createRegionProfile(allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd,options.nbins) \n+ \n+ if options.output_data == None:\n+ options.output_data = output+\'.txt\'\n+\n+ if options.summary_only: \n+ saveSummary(options.output_data,RegionProfile,options.nbins) \n+ \n+ else: \n+ saveProfile(options.output_data,RegionProfile,nRead)\n+ \n+ if not options.quiet:\n+ print \'results saved to: \'+ options.output_data \n+ \n+ if not (options.summary_only or options.profile_only ): \n+ # visualize \n+\n+ if options.window < 1:\n+ xlab = \'relative position (bins)\'\n+ else:\n+ xlab = \'relative position (bp)\'\n+\t \n+ if options.output_plot == None:\n+ options.output_plot = output+\'.pdf\'\n+\n+ title = options.plot_title+\'\\n n = \'+str(len(RegionProfile))\n+\n+ rscript = open("tmp.r","w")\n+ rscript.write("x <- read.table(\'"+options.output_data+"\')\\n")\n+ rscript.write("pdf(\'"+options.output_plot+"\')\\n")\n+ rscript.write("avg <- colSums(x[,3:ncol(x)])/nrow(x)\\n")\n+ rscript.write("err <- sd(x[,3:ncol(x)])/sqrt(nrow(x))\\n")\n+ \n+ if options.window == 0:\n+ rscript.write("xticks <- seq("+str(options.nbins)+")\\n")\n+ else:\n+ rscript.write("xticks <- seq("+str(-options.window)+","+str(options.window)+",length.out="+str(options.nbins)+")\\n")\n+\n+ if options.ylim != \'min,max\':\n+ rscript.write("ylim=c("+options.ylim+")\\n")\n+ else:\n+ rscript.write("ylim=c(min(avg-err),max(avg+err))\\n")\n+ rscript.write("par(cex=1.5)\\n")\n+ #smooth\n+ if options.span >= 0.1:\n+ rscript.write("avg = loess(avg~xticks,span="+str(options.span)+")$fitted\\n")\n+ rscript.write("err = loess(err~xticks,span="+str(options.span)+")$fitted\\n")\n+ rscript.write("plot(xticks,avg,ylab=\'average coverage\',main=\'"+title+"\',xlab=\'"+xlab+"\',type=\'l\',lwd=0,ylim=ylim)\\n") \n+ rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col=\'slateblue1\',border=NA)\\n")\n+ rscript.write("lines(xticks,avg,type=\'l\',lwd=1)\\n") \n+ #rscript.write("xticks <- barplot(avg,names.arg=seq("+str(options.nbins)+"),ylab=\'average coverage\',main=\'"+title+"\',xlab=\'"+xlab+"\',,ylim=c(min(avg-err),max(avg+err)))\\n")\n+ #rscript.write("arrows(xticks,avg+err, xticks, avg-err, angle=90, code=3, length=0.0,col=\'green\')\\n")\n+ #rscript.write("lines(xticks,avg,lwd=2)\\n")\n+ #rscript.write("lines(xticks,avg-err,col=\'green\')\\n")\n+ #rscript.write("lines(xticks,avg+err,col=\'green\')\\n")\n+ rscript.write("dev.off()\\n")\n+ rscript.close()\n+\n+ os.system("R --vanilla < tmp.r") \n+ \n+ # remove intermediate output\n+ os.system(\'rm *tmp.*\')\n+\n+ \n+if __name__ == "__main__":\n+ main()\n' |