Mercurial > repos > xuebing > sharplabtool
comparison mytools/intersectSig.py @ 9:87eb5c5ddfe9
Uploaded
| author | xuebing | 
|---|---|
| date | Fri, 09 Mar 2012 20:01:43 -0500 | 
| parents | f0dc65e7f6c0 | 
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| 8:361ec1c0479d | 9:87eb5c5ddfe9 | 
|---|---|
| 1 ''' | |
| 2 find overlap and test signifiance | |
| 3 ''' | |
| 4 | |
| 5 import os,sys | |
| 6 | |
| 7 def lineCount(filename): | |
| 8 if os.stat(filename).st_size == 0: | |
| 9 return 0 | |
| 10 with open(filename) as f: | |
| 11 for i, l in enumerate(f): | |
| 12 pass | |
| 13 print i | |
| 14 return i+1 | |
| 15 | |
| 16 def intersect(fileA,fileB,outfile,fraction,reciprocal): | |
| 17 # return fileA intervals that overlap with interval in fileB | |
| 18 cmd = 'intersectBed -a '+fileA+' -b '+fileB + ' -u -wa -f '+fraction +' '+ reciprocal + '>'+outfile | |
| 19 #print cmd | |
| 20 os.system(cmd) | |
| 21 | |
| 22 def shuffle(fileA,fileB,genomefile,fraction,reciprocal,N): | |
| 23 # shuffle fileA N times, return the distribution of overlaps | |
| 24 nOverlap = [] | |
| 25 for i in range(N): | |
| 26 # shuffle fileA using shuffleBed | |
| 27 #cmd = 'shuffleBed -i '+fileA+' -g '+genomefile +'>fileA.shuffled' | |
| 28 # using random_interval.py | |
| 29 cmd = 'python /Users/xuebing/galaxy-dist/tools/mytools/random_interval.py '+fileA+' fileA.shuffled across '+genomefile | |
| 30 os.system(cmd) | |
| 31 intersect('fileA.shuffled',fileB,'tmp',fraction,reciprocal) | |
| 32 nOverlap.append(lineCount('tmp')) | |
| 33 os.system('rm tmp') | |
| 34 os.system('rm fileA.shuffled') | |
| 35 return nOverlap | |
| 36 | |
| 37 def main(): | |
| 38 fileA = sys.argv[1] | |
| 39 fileB = sys.argv[2] | |
| 40 outfile = sys.argv[3] | |
| 41 outplot = sys.argv[4] | |
| 42 outshuffle = sys.argv[5] | |
| 43 N = int(sys.argv[6]) # times to shuffle | |
| 44 genomefile = sys.argv[7] | |
| 45 fraction = sys.argv[8] | |
| 46 if len(sys.argv) == 10: | |
| 47 reciprocal = sys.argv[9] # can only be '-r' | |
| 48 else: | |
| 49 reciprocal = '' | |
| 50 | |
| 51 #print sys.argv | |
| 52 | |
| 53 # number of lines in input | |
| 54 nA = lineCount(fileA) | |
| 55 nB = lineCount(fileB) | |
| 56 | |
| 57 # intersect on real data | |
| 58 intersect(fileA,fileB,outfile,fraction,reciprocal) | |
| 59 # number of overlaps | |
| 60 nOverlapReal = lineCount(outfile) | |
| 61 | |
| 62 #print 'number of intervals in inputA that overlap with intervals in inputB:',nOverlapReal | |
| 63 | |
| 64 # shuffle fileA to estimate background | |
| 65 nOverlapNull = shuffle(fileA,fileB,genomefile,fraction,reciprocal,N) | |
| 66 out = open(outshuffle,'w') | |
| 67 out.write("\t".join(map(str,nOverlapNull))) | |
| 68 out.close() | |
| 69 | |
| 70 # plot histogram | |
| 71 rscript = open('tmp.r','w') | |
| 72 rscript.write("options(warn=-1)\n") | |
| 73 rscript.write("x0 <- "+str(nOverlapReal)+"\n") | |
| 74 rscript.write("x <- c("+','.join(map(str,nOverlapNull))+")\n") | |
| 75 rscript.write("library(MASS)\n") | |
| 76 rscript.write("pv <- min((1+sum(x>=x0))/length(x),(1+sum(x<=x0))/length(x))\n") | |
| 77 rscript.write("title <- paste('actual:chance = ',x0,':',format(mean(x),digits=1,nsmall=1),' = ',format(x0/mean(x),digits=1,nsmall=2),', p-value < ',pv,sep='')\n") | |
| 78 rscript.write("pdf('"+outplot+"')\n") | |
| 79 rscript.write("library(grid)\n") | |
| 80 rscript.write("library(VennDiagram)\n") | |
| 81 rscript.write("venn <- venn.diagram(x=list(A=1:"+str(nA)+",B="+str(nA-nOverlapReal+1)+":"+str(nA+nB-nOverlapReal)+"),filename=NULL,fill=c('red','blue'),col='transparent',alpha=0.5,label.col='black',cex=3,lwd=0,fontfamily='serif',fontface='bold',cat.col = c('red', 'blue'),cat.cex=3,cat.fontfamily='serif',cat.fontface='bold')\n") | |
| 82 rscript.write("grid.draw(venn)\n") | |
| 83 rscript.write("h <- hist(x,breaks=50,xlab='number of overlaps',ylab='frequency',main=title)\n") | |
| 84 rscript.write("plot(h$mids,h$counts,type='h',xlim=c(min(h$mids,x0),max(x0,h$mids)),ylim=c(0,max(h$counts)),xlab='number of overlaps',ylab='frequency',main=title)\n") | |
| 85 rscript.write("points(x0,0,col='red')\n") | |
| 86 rscript.write("dev.off()\n") | |
| 87 rscript.close() | |
| 88 os.system("R --vanilla < tmp.r") | |
| 89 os.system('rm tmp.r') | |
| 90 main() | 
