Repository 'sharplab_interval_analysis'
hg clone https://toolshed.g2.bx.psu.edu/repos/xuebing/sharplab_interval_analysis

Changeset 2:a861f40db890 (2012-03-20)
Previous changeset 1:1a177d17b27d (2012-03-20) Next changeset 3:b11a21c704ec (2012-03-20)
Commit message:
Uploaded
added:
alignr.py
b
diff -r 1a177d17b27d -r a861f40db890 alignr.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/alignr.py Tue Mar 20 15:15:00 2012 -0400
[
b'@@ -0,0 +1,353 @@\n+\'\'\'\n+the scripts takes two files as input, and compute the coverage of \n+features in input 1 across features in input 2. Features in input 2 are \n+divided into bins and coverage is computed for each bin.  \n+\n+please check the help information by typing:\n+\n+    python align.py -h\n+\n+\n+requirement:\n+    please install the following tools first:\n+    bedtools:   for read/region overlapping, http://code.google.com/p/bedtools/\n+    \n+\'\'\'\n+\n+import os,sys,os.path\n+from optparse import OptionParser\n+\n+def lineCount(filename):\n+    with open(filename) as f:\n+        for i, l in enumerate(f):\n+            pass\n+    return i + 1\n+\n+def combineFilename(f1,f2):\n+    \'\'\'\n+    fuse two file names into one\n+    \'\'\'\n+    return f1.split(\'/\')[-1]+\'-\'+f2.split(\'/\')[-1]\n+\n+def checkFormat(filename1,filename2,input1format):\n+    \'\'\'\n+    check the format of input files\n+    \'\'\'\n+\n+    # file1\n+    # read the first line, see how many filds\n+    ncol1 = 6\n+    if input1format == "BED":\n+        f = open(filename1)\n+        line = f.readline().strip().split(\'\\t\')\n+        ncol1 = len(line)\n+        if ncol1 < 3:\n+            print "ERROR: "+filename1+" has only "+str(ncol1)+" columns (>=3 required). Make sure it has NO header line and is TAB-delimited."\n+            sys.exit(1)\n+        f.close()\n+     \n+    # file2\n+    f = open(filename2)\n+    line = f.readline().strip().split(\'\\t\')\n+    ncol2 = len(line)  \n+    if ncol2 < 3:\n+        print "ERROR: "+filename2+" has only "+str(ncol2)+" columns (>=3 required). Make sure it has NO header line and is TAB-delimited."\n+        sys.exit(1)        \n+\n+    return ncol1,ncol2\n+\n+\n+def makeBed(filename,ncol):\n+    \'\'\'\n+    add up to 6 column\n+    \'\'\'\n+    f = open(filename)\n+    outfile = filename+\'.tmp.bed\'\n+    outf = open(outfile,\'w\')\n+    if ncol == 3:\n+        for line in f:\n+            outf.write(line.strip()+\'\\t.\\t0\\t+\\n\')\n+    elif ncol == 4:\n+        for line in f:\n+            outf.write(line.strip()+\'\\t0\\t+\\n\')\n+    if ncol == 5:\n+        for line in f:\n+            outf.write(line.strip()+\'\\t+\\n\')\n+    f.close()\n+    outf.close()\n+    return outfile\n+    \n+def makeWindow(filename,window):\n+\n+    outfile = filename+\'-window=\'+str(window)+\'.tmp.bed\'\n+    if not os.path.exists(outfile):\n+        f=open(filename)\n+        out = open(outfile,\'w\')\n+        lines = f.readlines()\n+        if \'track\' in lines[0]:\n+            del lines[0]\n+        for line in lines:\n+            flds = line.strip().split(\'\\t\')\n+\n+            #new position\n+            center = (int(flds[1]) + int(flds[2]))/2\n+            start = center - window\n+            end = center + window\n+            if start >= 0:\n+                flds[1] = str(start)\n+                flds[2] = str(end)\n+                out.write(\'\\t\'.join(flds)+\'\\n\')\n+        f.close()\n+        out.close()\n+    return outfile\n+\n+def groupReadsMapped2aRegion(filename,ncol):\n+    \'\'\'\n+    read output from intersectBED\n+    find all reads mapped to each region\n+    \'\'\'\n+    try:\n+        f=open(filename)\n+        #If filename cannot be opened, print an error message and exit\n+    except IOError:\n+        print "could not open",filename,"Are you sure this file exists?"\n+        sys.exit(1)\n+    lines = f.readlines()\n+    \n+    allReadsStart = {}\n+    allReadsEnd = {}\n+    regionStrand = {}\n+    regionStart = {}\n+    regionEnd = {}\n+    \n+    for line in lines:\n+        flds = line.strip().split(\'\\t\')\n+        key = \'_\'.join(flds[ncol:(ncol+4)])\n+        if not allReadsStart.has_key(key):\n+            allReadsStart[key] = list()\n+            allReadsEnd[key] = list()\n+        #print flds[ncol+0],flds[ncol+1],flds[ncol+2]\n+        allReadsStart[key].append(int(flds[1]))\n+        allReadsEnd[key].append(int(flds[2]))\n+        regionStrand[key] = flds[ncol+5]  \n+        regionStart[key] = int(flds[ncol+1])    \n+        regionEnd[key] = int(flds[ncol+2])      \n+    return (allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd)\n+            \n+'..b'ions.inputB))\n+    \n+    if not options.plot:\n+        if options.aformat == "BAM":\n+            cmd = "intersectBed -abam "+str(options.inputA)+" -b "+str(options.inputB) + \' -bed -split \'\n+        else:\n+            cmd = "intersectBed -a "+str(options.inputA)+" -b "+str(options.inputB)\n+        if options.strandness:\n+            cmd = cmd + \' -s\'\n+        cmd = cmd +" -wo > "+ output+\'-intersect.tmp.bed\'\n+        if not options.quiet:\n+            print "search for overlappings: "+cmd\n+        status = os.system(cmd)\n+        if status != 0:\n+            sys.exit(1)\n+\n+    \n+    if not options.quiet:\n+        print \'group reads mapped to the same region...\'\n+    \n+    allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd = groupReadsMapped2aRegion(output+\'-intersect.tmp.bed\',ncol)\n+\n+    if len(allReadsStart) == 0:\n+        if not options.quiet:\n+            print \'no overlap found!!\'\n+        os.system(\'rm *tmp.*\')\n+        sys.exit(1)\n+    \n+    if not options.quiet:\n+        print \'count number of reads mapped to each bin...\'\n+    \n+    RegionProfile,nRead = createRegionProfile(allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd,options.nbins) \n+   \n+    if options.output_data == None:\n+        options.output_data = output+\'.txt\'\n+\n+    if options.summary_only:  \n+        saveSummary(options.output_data,RegionProfile,options.nbins) \n+    \n+    else:                 \n+        saveProfile(options.output_data,RegionProfile,nRead)\n+    \n+    if not options.quiet:\n+        print \'results saved to: \'+ options.output_data \n+        \n+    if not (options.summary_only or options.profile_only ):          \n+        # visualize \n+\n+        if options.window < 1:\n+            xlab = \'relative position (bins)\'\n+        else:\n+            xlab = \'relative position (bp)\'\n+\t            \n+        if options.output_plot == None:\n+            options.output_plot = output+\'.pdf\'\n+\n+        title = options.plot_title+\'\\n n = \'+str(len(RegionProfile))\n+\n+        rscript = open("tmp.r","w")\n+        rscript.write("x <- read.table(\'"+options.output_data+"\')\\n")\n+        rscript.write("pdf(\'"+options.output_plot+"\')\\n")\n+        rscript.write("avg <- colSums(x[,3:ncol(x)])/nrow(x)\\n")\n+        rscript.write("err <- sd(x[,3:ncol(x)])/sqrt(nrow(x))\\n")\n+        \n+        if options.window == 0:\n+            rscript.write("xticks <- seq("+str(options.nbins)+")\\n")\n+        else:\n+            rscript.write("xticks <- seq("+str(-options.window)+","+str(options.window)+",length.out="+str(options.nbins)+")\\n")\n+\n+        if options.ylim != \'min,max\':\n+            rscript.write("ylim=c("+options.ylim+")\\n")\n+        else:\n+            rscript.write("ylim=c(min(avg-err),max(avg+err))\\n")\n+        rscript.write("par(cex=1.5)\\n")\n+        #smooth\n+        if options.span >= 0.1:\n+            rscript.write("avg = loess(avg~xticks,span="+str(options.span)+")$fitted\\n")\n+            rscript.write("err = loess(err~xticks,span="+str(options.span)+")$fitted\\n")\n+        rscript.write("plot(xticks,avg,ylab=\'average coverage\',main=\'"+title+"\',xlab=\'"+xlab+"\',type=\'l\',lwd=0,ylim=ylim)\\n")   \n+        rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col=\'slateblue1\',border=NA)\\n")\n+        rscript.write("lines(xticks,avg,type=\'l\',lwd=1)\\n")   \n+        #rscript.write("xticks <- barplot(avg,names.arg=seq("+str(options.nbins)+"),ylab=\'average coverage\',main=\'"+title+"\',xlab=\'"+xlab+"\',,ylim=c(min(avg-err),max(avg+err)))\\n")\n+        #rscript.write("arrows(xticks,avg+err, xticks, avg-err, angle=90, code=3, length=0.0,col=\'green\')\\n")\n+        #rscript.write("lines(xticks,avg,lwd=2)\\n")\n+        #rscript.write("lines(xticks,avg-err,col=\'green\')\\n")\n+        #rscript.write("lines(xticks,avg+err,col=\'green\')\\n")\n+        rscript.write("dev.off()\\n")\n+        rscript.close()\n+\n+        os.system("R --vanilla < tmp.r")    \n+    \n+    # remove intermediate output\n+    os.system(\'rm *tmp.*\')\n+\n+    \n+if __name__ == "__main__":\n+    main()\n'