diff alignr.py @ 14:76e1b1b21cce default tip

Deleted selected files
author xuebing
date Tue, 13 Mar 2012 19:05:10 -0400
parents 292186c14b08
children
line wrap: on
line diff
--- a/alignr.py	Sat Mar 10 08:17:36 2012 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,353 +0,0 @@
-'''
-the scripts takes two files as input, and compute the coverage of 
-features in input 1 across features in input 2. Features in input 2 are 
-divided into bins and coverage is computed for each bin.  
-
-please check the help information by typing:
-
-    python align.py -h
-
-
-requirement:
-    please install the following tools first:
-    bedtools:   for read/region overlapping, http://code.google.com/p/bedtools/
-    
-'''
-
-import os,sys,os.path
-from optparse import OptionParser
-
-def lineCount(filename):
-    with open(filename) as f:
-        for i, l in enumerate(f):
-            pass
-    return i + 1
-
-def combineFilename(f1,f2):
-    '''
-    fuse two file names into one
-    '''
-    return f1.split('/')[-1]+'-'+f2.split('/')[-1]
-
-def checkFormat(filename1,filename2,input1format):
-    '''
-    check the format of input files
-    '''
-
-    # file1
-    # read the first line, see how many filds
-    ncol1 = 6
-    if input1format == "BED":
-        f = open(filename1)
-        line = f.readline().strip().split('\t')
-        ncol1 = len(line)
-        if ncol1 < 3:
-            print "ERROR: "+filename1+" has only "+str(ncol1)+" columns (>=3 required). Make sure it has NO header line and is TAB-delimited."
-            sys.exit(1)
-        f.close()
-     
-    # file2
-    f = open(filename2)
-    line = f.readline().strip().split('\t')
-    ncol2 = len(line)  
-    if ncol2 < 3:
-        print "ERROR: "+filename2+" has only "+str(ncol2)+" columns (>=3 required). Make sure it has NO header line and is TAB-delimited."
-        sys.exit(1)        
-
-    return ncol1,ncol2
-
-
-def makeBed(filename,ncol):
-    '''
-    add up to 6 column
-    '''
-    f = open(filename)
-    outfile = filename+'.tmp.bed'
-    outf = open(outfile,'w')
-    if ncol == 3:
-        for line in f:
-            outf.write(line.strip()+'\t.\t0\t+\n')
-    elif ncol == 4:
-        for line in f:
-            outf.write(line.strip()+'\t0\t+\n')
-    if ncol == 5:
-        for line in f:
-            outf.write(line.strip()+'\t+\n')
-    f.close()
-    outf.close()
-    return outfile
-    
-def makeWindow(filename,window):
-
-    outfile = filename+'-window='+str(window)+'.tmp.bed'
-    if not os.path.exists(outfile):
-        f=open(filename)
-        out = open(outfile,'w')
-        lines = f.readlines()
-        if 'track' in lines[0]:
-            del lines[0]
-        for line in lines:
-            flds = line.strip().split('\t')
-
-            #new position
-            center = (int(flds[1]) + int(flds[2]))/2
-            start = center - window
-            end = center + window
-            if start >= 0:
-                flds[1] = str(start)
-                flds[2] = str(end)
-                out.write('\t'.join(flds)+'\n')
-        f.close()
-        out.close()
-    return outfile
-
-def groupReadsMapped2aRegion(filename,ncol):
-    '''
-    read output from intersectBED
-    find all reads mapped to each region
-    '''
-    try:
-        f=open(filename)
-        #If filename cannot be opened, print an error message and exit
-    except IOError:
-        print "could not open",filename,"Are you sure this file exists?"
-        sys.exit(1)
-    lines = f.readlines()
-    
-    allReadsStart = {}
-    allReadsEnd = {}
-    regionStrand = {}
-    regionStart = {}
-    regionEnd = {}
-    
-    for line in lines:
-        flds = line.strip().split('\t')
-        key = '_'.join(flds[ncol:(ncol+4)])
-        if not allReadsStart.has_key(key):
-            allReadsStart[key] = list()
-            allReadsEnd[key] = list()
-        #print flds[ncol+0],flds[ncol+1],flds[ncol+2]
-        allReadsStart[key].append(int(flds[1]))
-        allReadsEnd[key].append(int(flds[2]))
-        regionStrand[key] = flds[ncol+5]  
-        regionStart[key] = int(flds[ncol+1])    
-        regionEnd[key] = int(flds[ncol+2])      
-    return (allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd)
-            
-            
-def createRegionProfile(allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd,nbins):
-    '''
-    each region is divided into nbins
-    compute the number of reads covering each bin for each region 
-    '''
-    RegionProfile = {}
-    nRead = {}  # num of all reads in the region
-    for region in allReadsStart.keys():
-        RegionProfile[region] = [0]*nbins
-        nRead[region] = len(allReadsStart[region])
-        #print region,nRead[region],allReadsStart[region]
-        for i in range(nRead[region]):
-            RegionProfile[region] = updateRegionCount(RegionProfile[region],allReadsStart[region][i],allReadsEnd[region][i],regionStart[region],regionEnd[region],regionStrand[region],nbins)
-    return RegionProfile,nRead
-    
-def updateRegionCount(RegionCount,readStart,readEnd,regionStart,regionEnd,strand,nbins):
-    '''
-    each region is divided into nbins,
-    add 1 to each bin covered by the read  
-    '''
-    L = regionEnd-regionStart
-    start = int(nbins*(readStart-regionStart)/L)
-    end = int(nbins*(readEnd-regionStart)/L)
-    if start < 0:
-        start = 0
-    if end > nbins:
-        end = nbins
-    if strand == '-':        
-        for i in range(start,end):
-            RegionCount[nbins-1-i] = RegionCount[nbins-1-i] + 1
-    else: # if the 6th column of the input is not strand, will treat as + strand by default       
-        for i in range(start,end):
-            RegionCount[i] = RegionCount[i] + 1            
-    return RegionCount
-
-def saveProfile(filename,Profile,nRegion):
-    out = open(filename,'w')
-    for regionType in Profile.keys():
-        #print Profile[regionType]
-        out.write(regionType+'\t'+str(nRegion[regionType])+'\t'+'\t'.join(map(str,Profile[regionType]))+'\n')    
-                    
-def saveSummary(filename,Profile,nbin):
-    out = open(filename+'.summary','w')
-
-    nfeat = len(Profile)
-    summaryprofile = [0]*nbin
-    for regionType in Profile.keys():
-        for i in range(nbin):
-            summaryprofile[i] += Profile[regionType][i]    
-    out.write(filename+'\t'+str(nfeat)+'\t'+'\t'.join(map(str,summaryprofile))+'\n')  
-    out.close()
-    # calculate standard error
-    out = open(filename+'.standarderror','w')
-    sd = [0.0]*nbin
-    u = [0.0]*nbin 
-    for i in range(nbin):
-        u[i] = float(summaryprofile[i])/nfeat
-        for regionType in Profile.keys():
-            sd[i] = sd[i] + (Profile[regionType][i] - u[i])**2
-        sd[i] = sd[i]**0.5 / nfeat
-    out.write(filename+'\t'+str(nfeat)+'\t'+'\t'.join(map(str,sd))+'\n')  
-    out.close()    
-                
-def main():
-    usage = "usage: %prog [options] -a inputA -b inputB"
-    parser = OptionParser(usage)
-    parser.add_option("-a", dest="inputA",
-                      help="(required) input file A, interval (first 3 columns are chrN, start and end) or BAM format. The script computes the depth of coverage of features in file A across the features in file B" )                                                
-    parser.add_option("-b",dest="inputB",
-                      help="(required) input file B, interval file" )                                                
-    parser.add_option("-f",dest="aformat",default="BED",
-                      help="Format of input file A. Can be BED (default) or BAM")
-    parser.add_option("-w",type='int',dest="window",
-                      help="Generate new inputB by making a window of 2 x WINDOW bp (in total) flanking the center of each input feature" )     
-    parser.add_option("-n", type="int", dest="nbins",default=100,
-                        help="number of bins. Features in B are binned, and the coverage is computed for each bin. Default is 100")                    
-    parser.add_option("-s",action="store_true", dest="strandness",
-                      help="enforce strandness: require overlapping on the same strand. Default is off")
-    parser.add_option("-p",action="store_true", dest="plot",default=False,
-                      help="load existed intersectBed outputfile")
-    parser.add_option("-q", action="store_true", dest="quiet",default=False,
-                        help="suppress output on screen")
-    parser.add_option("-o", dest="output_data",
-                      help="(optional) output coverage file (txt) name." )
-    parser.add_option("-v", dest="output_plot",
-                      help="(optional) output plot (pdf) file name." )
-    parser.add_option("-l", dest="plot_title", default="",
-                      help="(optional) output title of the plot." )
-    parser.add_option("--ylim", dest="ylim", default="min,max",
-                      help="(optional) ylim of the plot" )
-    parser.add_option("--summary-only", action="store_true", dest="summary_only",default=False,
-                        help="save profile summary only (no data for individual features)")
-    parser.add_option("--compute-se", action="store_true", dest="compute_se",default=False,
-                        help="compute and plot standard deviation for each bin. used when --summary-only is on")
-    parser.add_option("--profile-only", action="store_true", dest="profile_only",default=False,
-                        help="save profile only (no plot)")
-    parser.add_option("--span", type="float", dest="span",default=0.1,
-                        help="loess span smooth parameter, 0.1 ~ 1")                    
-    
-    (options, args) = parser.parse_args()
-
-    if options.inputA == None or options.inputB == None:
-        parser.error("Please specify two input files!!")
-
-    if not options.quiet:
-        print "checking input file format..."
-        
-    ncol,ncol2 = checkFormat(options.inputA ,options.inputB,options.aformat)
-
-    if ncol2 < 6:
-        options.inputB = makeBed(options.inputB,ncol2)        
-        if not options.quiet:
-            print "fill up 6 columns"
-
-    if options.window > 0:
-        if not options.quiet:
-            print "making windows from "+options.inputB+"..." 
-        options.inputB = makeWindow(options.inputB,options.window)
-    
-    output = combineFilename(str(options.inputA),str(options.inputB))
-    
-    if not options.plot:
-        if options.aformat == "BAM":
-            cmd = "intersectBed -abam "+str(options.inputA)+" -b "+str(options.inputB) + ' -bed -split '
-        else:
-            cmd = "intersectBed -a "+str(options.inputA)+" -b "+str(options.inputB)
-        if options.strandness:
-            cmd = cmd + ' -s'
-        cmd = cmd +" -wo > "+ output+'-intersect.tmp.bed'
-        if not options.quiet:
-            print "search for overlappings: "+cmd
-        status = os.system(cmd)
-        if status != 0:
-            sys.exit(1)
-
-    
-    if not options.quiet:
-        print 'group reads mapped to the same region...'
-    
-    allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd = groupReadsMapped2aRegion(output+'-intersect.tmp.bed',ncol)
-
-    if len(allReadsStart) == 0:
-        if not options.quiet:
-            print 'no overlap found!!'
-        os.system('rm *tmp.*')
-        sys.exit(1)
-    
-    if not options.quiet:
-        print 'count number of reads mapped to each bin...'
-    
-    RegionProfile,nRead = createRegionProfile(allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd,options.nbins) 
-   
-    if options.output_data == None:
-        options.output_data = output+'.txt'
-
-    if options.summary_only:  
-        saveSummary(options.output_data,RegionProfile,options.nbins) 
-    
-    else:                 
-        saveProfile(options.output_data,RegionProfile,nRead)
-    
-    if not options.quiet:
-        print 'results saved to: '+ options.output_data 
-        
-    if not (options.summary_only or options.profile_only ):          
-        # visualize 
-
-        if options.window < 1:
-            xlab = 'relative position (bins)'
-        else:
-            xlab = 'relative position (bp)'
-	            
-        if options.output_plot == None:
-            options.output_plot = output+'.pdf'
-
-        title = options.plot_title+'\n n = '+str(len(RegionProfile))
-
-        rscript = open("tmp.r","w")
-        rscript.write("x <- read.table('"+options.output_data+"')\n")
-        rscript.write("pdf('"+options.output_plot+"')\n")
-        rscript.write("avg <- colSums(x[,3:ncol(x)])/nrow(x)\n")
-        rscript.write("err <- sd(x[,3:ncol(x)])/sqrt(nrow(x))\n")
-        
-        if options.window == 0:
-            rscript.write("xticks <- seq("+str(options.nbins)+")\n")
-        else:
-            rscript.write("xticks <- seq("+str(-options.window)+","+str(options.window)+",length.out="+str(options.nbins)+")\n")
-
-        if options.ylim != 'min,max':
-            rscript.write("ylim=c("+options.ylim+")\n")
-        else:
-            rscript.write("ylim=c(min(avg-err),max(avg+err))\n")
-        rscript.write("par(cex=1.5)\n")
-        #smooth
-        if options.span >= 0.1:
-            rscript.write("avg = loess(avg~xticks,span="+str(options.span)+")$fitted\n")
-            rscript.write("err = loess(err~xticks,span="+str(options.span)+")$fitted\n")
-        rscript.write("plot(xticks,avg,ylab='average coverage',main='"+title+"',xlab='"+xlab+"',type='l',lwd=0,ylim=ylim)\n")   
-        rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col='slateblue1',border=NA)\n")
-        rscript.write("lines(xticks,avg,type='l',lwd=1)\n")   
-        #rscript.write("xticks <- barplot(avg,names.arg=seq("+str(options.nbins)+"),ylab='average coverage',main='"+title+"',xlab='"+xlab+"',,ylim=c(min(avg-err),max(avg+err)))\n")
-        #rscript.write("arrows(xticks,avg+err, xticks, avg-err, angle=90, code=3, length=0.0,col='green')\n")
-        #rscript.write("lines(xticks,avg,lwd=2)\n")
-        #rscript.write("lines(xticks,avg-err,col='green')\n")
-        #rscript.write("lines(xticks,avg+err,col='green')\n")
-        rscript.write("dev.off()\n")
-        rscript.close()
-
-        os.system("R --vanilla < tmp.r")    
-    
-    # remove intermediate output
-    os.system('rm *tmp.*')
-
-    
-if __name__ == "__main__":
-    main()