Mercurial > repos > xuebing > sharplabtool
view tools/mytools/intersectSig.py @ 2:c2a356708570
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:45:42 -0500 |
parents | 9071e359b9a3 |
children |
line wrap: on
line source
''' find overlap and test signifiance ''' import os,sys def lineCount(filename): if os.stat(filename).st_size == 0: return 0 with open(filename) as f: for i, l in enumerate(f): pass print i return i+1 def intersect(fileA,fileB,outfile,fraction,reciprocal): # return fileA intervals that overlap with interval in fileB cmd = 'intersectBed -a '+fileA+' -b '+fileB + ' -u -wa -f '+fraction +' '+ reciprocal + '>'+outfile #print cmd os.system(cmd) def shuffle(fileA,fileB,genomefile,fraction,reciprocal,N): # shuffle fileA N times, return the distribution of overlaps nOverlap = [] for i in range(N): # shuffle fileA using shuffleBed #cmd = 'shuffleBed -i '+fileA+' -g '+genomefile +'>fileA.shuffled' # using random_interval.py cmd = 'python /Users/xuebing/galaxy-dist/tools/mytools/random_interval.py '+fileA+' fileA.shuffled across '+genomefile os.system(cmd) intersect('fileA.shuffled',fileB,'tmp',fraction,reciprocal) nOverlap.append(lineCount('tmp')) os.system('rm tmp') os.system('rm fileA.shuffled') return nOverlap def main(): fileA = sys.argv[1] fileB = sys.argv[2] outfile = sys.argv[3] outplot = sys.argv[4] outshuffle = sys.argv[5] N = int(sys.argv[6]) # times to shuffle genomefile = sys.argv[7] fraction = sys.argv[8] if len(sys.argv) == 10: reciprocal = sys.argv[9] # can only be '-r' else: reciprocal = '' #print sys.argv # number of lines in input nA = lineCount(fileA) nB = lineCount(fileB) # intersect on real data intersect(fileA,fileB,outfile,fraction,reciprocal) # number of overlaps nOverlapReal = lineCount(outfile) #print 'number of intervals in inputA that overlap with intervals in inputB:',nOverlapReal # shuffle fileA to estimate background nOverlapNull = shuffle(fileA,fileB,genomefile,fraction,reciprocal,N) out = open(outshuffle,'w') out.write("\t".join(map(str,nOverlapNull))) out.close() # plot histogram rscript = open('tmp.r','w') rscript.write("options(warn=-1)\n") rscript.write("x0 <- "+str(nOverlapReal)+"\n") rscript.write("x <- c("+','.join(map(str,nOverlapNull))+")\n") rscript.write("library(MASS)\n") rscript.write("pv <- min((1+sum(x>=x0))/length(x),(1+sum(x<=x0))/length(x))\n") rscript.write("title <- paste('actual:chance = ',x0,':',format(mean(x),digits=1,nsmall=1),' = ',format(x0/mean(x),digits=1,nsmall=2),', p-value < ',pv,sep='')\n") rscript.write("pdf('"+outplot+"')\n") rscript.write("library(grid)\n") rscript.write("library(VennDiagram)\n") rscript.write("venn <- venn.diagram(x=list(A=1:"+str(nA)+",B="+str(nA-nOverlapReal+1)+":"+str(nA+nB-nOverlapReal)+"),filename=NULL,fill=c('red','blue'),col='transparent',alpha=0.5,label.col='black',cex=3,lwd=0,fontfamily='serif',fontface='bold',cat.col = c('red', 'blue'),cat.cex=3,cat.fontfamily='serif',cat.fontface='bold')\n") rscript.write("grid.draw(venn)\n") rscript.write("h <- hist(x,breaks=50,xlab='number of overlaps',ylab='frequency',main=title)\n") rscript.write("plot(h$mids,h$counts,type='h',xlim=c(min(h$mids,x0),max(x0,h$mids)),ylim=c(0,max(h$counts)),xlab='number of overlaps',ylab='frequency',main=title)\n") rscript.write("points(x0,0,col='red')\n") rscript.write("dev.off()\n") rscript.close() os.system("R --vanilla < tmp.r") os.system('rm tmp.r') main()