## Mercurial > repos > xuebing > sharplabtool

### annotate mytools/intersectSig.py @ 9:87eb5c5ddfe9

Find changesets by keywords (author, files, the commit message), revision
number or hash, or revset expression.

Uploaded

author | xuebing |
---|---|

date | Fri, 09 Mar 2012 20:01:43 -0500 |

parents | f0dc65e7f6c0 |

children |

rev | line source |
---|---|

7 | 1 ''' |

2 find overlap and test signifiance | |

3 ''' | |

4 | |

5 import os,sys | |

6 | |

7 def lineCount(filename): | |

8 if os.stat(filename).st_size == 0: | |

9 return 0 | |

10 with open(filename) as f: | |

11 for i, l in enumerate(f): | |

12 pass | |

13 print i | |

14 return i+1 | |

15 | |

16 def intersect(fileA,fileB,outfile,fraction,reciprocal): | |

17 # return fileA intervals that overlap with interval in fileB | |

18 cmd = 'intersectBed -a '+fileA+' -b '+fileB + ' -u -wa -f '+fraction +' '+ reciprocal + '>'+outfile | |

19 #print cmd | |

20 os.system(cmd) | |

21 | |

22 def shuffle(fileA,fileB,genomefile,fraction,reciprocal,N): | |

23 # shuffle fileA N times, return the distribution of overlaps | |

24 nOverlap = [] | |

25 for i in range(N): | |

26 # shuffle fileA using shuffleBed | |

27 #cmd = 'shuffleBed -i '+fileA+' -g '+genomefile +'>fileA.shuffled' | |

28 # using random_interval.py | |

29 cmd = 'python /Users/xuebing/galaxy-dist/tools/mytools/random_interval.py '+fileA+' fileA.shuffled across '+genomefile | |

30 os.system(cmd) | |

31 intersect('fileA.shuffled',fileB,'tmp',fraction,reciprocal) | |

32 nOverlap.append(lineCount('tmp')) | |

33 os.system('rm tmp') | |

34 os.system('rm fileA.shuffled') | |

35 return nOverlap | |

36 | |

37 def main(): | |

38 fileA = sys.argv[1] | |

39 fileB = sys.argv[2] | |

40 outfile = sys.argv[3] | |

41 outplot = sys.argv[4] | |

42 outshuffle = sys.argv[5] | |

43 N = int(sys.argv[6]) # times to shuffle | |

44 genomefile = sys.argv[7] | |

45 fraction = sys.argv[8] | |

46 if len(sys.argv) == 10: | |

47 reciprocal = sys.argv[9] # can only be '-r' | |

48 else: | |

49 reciprocal = '' | |

50 | |

51 #print sys.argv | |

52 | |

53 # number of lines in input | |

54 nA = lineCount(fileA) | |

55 nB = lineCount(fileB) | |

56 | |

57 # intersect on real data | |

58 intersect(fileA,fileB,outfile,fraction,reciprocal) | |

59 # number of overlaps | |

60 nOverlapReal = lineCount(outfile) | |

61 | |

62 #print 'number of intervals in inputA that overlap with intervals in inputB:',nOverlapReal | |

63 | |

64 # shuffle fileA to estimate background | |

65 nOverlapNull = shuffle(fileA,fileB,genomefile,fraction,reciprocal,N) | |

66 out = open(outshuffle,'w') | |

67 out.write("\t".join(map(str,nOverlapNull))) | |

68 out.close() | |

69 | |

70 # plot histogram | |

71 rscript = open('tmp.r','w') | |

72 rscript.write("options(warn=-1)\n") | |

73 rscript.write("x0 <- "+str(nOverlapReal)+"\n") | |

74 rscript.write("x <- c("+','.join(map(str,nOverlapNull))+")\n") | |

75 rscript.write("library(MASS)\n") | |

76 rscript.write("pv <- min((1+sum(x>=x0))/length(x),(1+sum(x<=x0))/length(x))\n") | |

77 rscript.write("title <- paste('actual:chance = ',x0,':',format(mean(x),digits=1,nsmall=1),' = ',format(x0/mean(x),digits=1,nsmall=2),', p-value < ',pv,sep='')\n") | |

78 rscript.write("pdf('"+outplot+"')\n") | |

79 rscript.write("library(grid)\n") | |

80 rscript.write("library(VennDiagram)\n") | |

81 rscript.write("venn <- venn.diagram(x=list(A=1:"+str(nA)+",B="+str(nA-nOverlapReal+1)+":"+str(nA+nB-nOverlapReal)+"),filename=NULL,fill=c('red','blue'),col='transparent',alpha=0.5,label.col='black',cex=3,lwd=0,fontfamily='serif',fontface='bold',cat.col = c('red', 'blue'),cat.cex=3,cat.fontfamily='serif',cat.fontface='bold')\n") | |

82 rscript.write("grid.draw(venn)\n") | |

83 rscript.write("h <- hist(x,breaks=50,xlab='number of overlaps',ylab='frequency',main=title)\n") | |

84 rscript.write("plot(h$mids,h$counts,type='h',xlim=c(min(h$mids,x0),max(x0,h$mids)),ylim=c(0,max(h$counts)),xlab='number of overlaps',ylab='frequency',main=title)\n") | |

85 rscript.write("points(x0,0,col='red')\n") | |

86 rscript.write("dev.off()\n") | |

87 rscript.close() | |

88 os.system("R --vanilla < tmp.r") | |

89 os.system('rm tmp.r') | |

90 main() |