Mercurial > repos > cristian > rbgoa
comparison gomwu.functions.R @ 0:91261b42c07e draft
"planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
author | cristian |
---|---|
date | Thu, 14 Apr 2022 13:28:05 +0000 |
parents | |
children | f7287f82602f |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:91261b42c07e |
---|---|
1 clusteringGOs=function(gen2go,div,cutHeight) { | |
2 inname=paste("dissim0_",div,"_",gen2go,sep="") | |
3 outname=paste("cl_",inname,sep="") | |
4 if (!file.exists(outname)) { | |
5 diss=read.table(inname,sep="\t",header=T,check.names=F) | |
6 row.names(diss)=names(diss) | |
7 hc=hclust(as.dist(diss),method="complete") | |
8 cc=cutree(hc,h=cutHeight) | |
9 write.csv(cc,file=outname,quote=F) | |
10 } | |
11 } | |
12 | |
13 | |
14 | |
15 #--------------- | |
16 gomwuStats=function(input,goDatabase,goAnnotations, goDivision, scriptdir, Module=FALSE, Alternative="t", adjust.multcomp="BH", clusterCutHeight=0.25,largest=0.1,smallest=5,perlPath="perl", shuffle.reps=20){ | |
17 | |
18 extraOptions=paste("largest=",largest," smallest=",smallest," cutHeight=",clusterCutHeight,sep="") | |
19 if (Module==TRUE) { adjust.multcomp="shuffle" } | |
20 gomwu_a = paste(scriptdir, "gomwu_a.pl", sep="/") | |
21 gomwu_b = paste(scriptdir, "gomwu_b.pl", sep="/") | |
22 system(paste(perlPath, gomwu_a,goDatabase,goAnnotations,input,goDivision,extraOptions)) | |
23 clusteringGOs(goAnnotations,goDivision,clusterCutHeight) | |
24 system(paste(perlPath, gomwu_b,goAnnotations,input,goDivision)) | |
25 | |
26 inname=paste(goDivision,"_",input,sep="") | |
27 rsq=read.table(inname,sep="\t",header=T) | |
28 rsq$term=as.factor(rsq$term) | |
29 | |
30 mwut.t=TRUE | |
31 if (length(levels(as.factor(rsq$value)))==2) { | |
32 cat("Binary classification detected; will perform Fisher's test\n"); | |
33 mwut.t=F | |
34 rr=fisherTest(rsq) | |
35 } else { | |
36 if (Module==TRUE) { | |
37 rsq.f=rsq | |
38 rsq.f$value=as.numeric(rsq.f$value>0) | |
39 rf=fisherTest(rsq.f) | |
40 rsq.m=rsq[rsq$value>0,] | |
41 rsq.m$term=factor(rsq.m$term,levels=unique(rsq.m$term)) | |
42 rrm=mwuTest(rsq.m,"g") | |
43 rr0=rf[rf$term %in% rrm$term,] | |
44 rr1=rf[!(rf$term %in% rrm$term),] | |
45 rr0=rr0[order(rr0$term),] | |
46 rrm=rrm[order(rrm$term),] | |
47 rr0$pval=rr0$pval*rrm$pval | |
48 rr=rbind(rr0,rr1) | |
49 } else { | |
50 cat("Continuous measure of interest: will perform MWU test\n"); | |
51 rr=mwuTest(rsq,Alternative) | |
52 } | |
53 } | |
54 | |
55 if (adjust.multcomp=="shuffle"){ | |
56 message("shuffling values to calculate FDR, ",shuffle.reps," reps") | |
57 reps=shuffle.reps | |
58 spv=c() | |
59 for (i in 1:reps) { | |
60 message("replicate ",i) | |
61 rsqq=rsq | |
62 rsqq$value=sample(rsq$value) | |
63 if (Module==TRUE) { | |
64 rsq.f=rsqq | |
65 rsq.f$value=as.numeric(rsq.f$value>0) | |
66 rf=fisherTest(rsq.f) | |
67 rsq.m=rsqq[rsqq$value>0,] | |
68 rsq.m$term=factor(rsq.m$term,levels=unique(rsq.m$term)) | |
69 rrm=mwuTest(rsq.m,"g") | |
70 rr0=rf[rf$term %in% rrm$term,] | |
71 rr1=rf[!(rf$term %in% rrm$term),] | |
72 rr0=rr0[order(rr0$term),] | |
73 rrm=rrm[order(rrm$term),] | |
74 rr0$pval=rr0$pval*rrm$pval | |
75 rs=rbind(rr0,rr1) | |
76 } else { | |
77 if (mwut.t==TRUE) { rs=mwuTest(rsqq,Alternative) } else { rs=fisherTest(rsqq) } | |
78 } | |
79 spv=append(spv,rs$pval) | |
80 } | |
81 fdr=c() | |
82 for (p in rr$pval){ | |
83 fdr=append(fdr,(sum(spv<=p)/reps)/sum(rr$pval<=p)) | |
84 } | |
85 fdr[fdr>1]=1 | |
86 } else { | |
87 fdr=p.adjust(rr$pval,method=adjust.multcomp) | |
88 } | |
89 | |
90 message(sum(fdr<0.1)," GO terms at 10% FDR") | |
91 rr$p.adj=fdr | |
92 fname=paste("MWU_",inname,sep="") | |
93 write.table(rr,fname,row.names=F) | |
94 } | |
95 | |
96 #--------------------- | |
97 mwuTest=function(gotable,Alternative) { | |
98 gos=gotable | |
99 terms=levels(gos$term) | |
100 gos$seq=as.character(gos$seq) | |
101 nrg=gos[!duplicated(gos$seq),5] | |
102 names(nrg)=gos[!duplicated(gos$seq),4] | |
103 # nrg=nrg+rnorm(nrg,sd=0.01) # to be able to do exact wilcox test | |
104 rnk=rank(nrg) | |
105 names(rnk)=names(nrg) | |
106 pvals=c();drs=c();nams=c();levs=c();nseqs=c() | |
107 for (t in terms){ | |
108 got=gos[gos$term==t,] | |
109 got=got[!duplicated(got$seq),] | |
110 ngot=gos[gos$term!=t,] | |
111 ngot=ngot[!duplicated(ngot$seq),] | |
112 ngot=ngot[!(ngot$seq %in% got$seq),] | |
113 sgo.yes=got$seq | |
114 n1=length(sgo.yes) | |
115 sgo.no=ngot$seq | |
116 n2=length(sgo.no) | |
117 #message(t," wilcox:",n1," ",n2) | |
118 wi=wilcox.test(nrg[sgo.yes],nrg[sgo.no],alternative=Alternative) # removed correct=FALSE | |
119 r1=sum(rnk[sgo.yes])/n1 | |
120 r0=sum(rnk[sgo.no])/n2 | |
121 dr=r1-r0 | |
122 drs=append(drs,round(dr,0)) | |
123 levs=append(levs,got$lev[1]) | |
124 nams=append(nams,as.character(got$name[1])) | |
125 pvals=append(pvals,wi$p.value) | |
126 nseqs=append(nseqs,n1) | |
127 } | |
128 res=data.frame(cbind("delta.rank"=drs,"pval"=pvals,"level"=levs,nseqs)) | |
129 res=cbind(res,"term"=as.character(terms),"name"=nams) | |
130 res$pval=as.numeric(as.character(res$pval)) | |
131 res$delta.rank=as.numeric(as.character(res$delta.rank)) | |
132 res$level=as.numeric(as.character(res$level)) | |
133 res$nseqs=as.numeric(as.character(res$nseqs)) | |
134 return(res) | |
135 } | |
136 #------------------------ | |
137 fisherTest=function(gotable) { | |
138 gos=gotable | |
139 terms=levels(gos$term) | |
140 gos$seq=as.character(gos$seq) | |
141 pft=c();nam=c();lev=c();nseqs=c() | |
142 for (t in terms) { | |
143 got=gos[gos$term==t,] | |
144 got=got[!duplicated(got$seq),] | |
145 ngot=gos[gos$term!=t,] | |
146 ngot=ngot[!duplicated(ngot$seq),] | |
147 ngot=ngot[!(ngot$seq %in% got$seq),] | |
148 go.sig=sum(got$value) | |
149 go.ns=length(got[,1])-go.sig | |
150 ngo.sig=sum(ngot$value) | |
151 ngo.ns=length(ngot[,1])-ngo.sig | |
152 sig=c(go.sig,ngo.sig) # number of significant genes belonging and not belonging to the tested GO category | |
153 ns=c(go.ns,ngo.ns) # number of not-significant genes belonging and not belonging to the tested GO category | |
154 mm=matrix(c(sig,ns),nrow=2,dimnames=list(ns=c("go","notgo"),sig=c("go","notgo"))) | |
155 ff=fisher.test(mm,alternative="greater") | |
156 pft=append(pft,ff$p.value) | |
157 nam=append(nam,as.character(got$name[1])) | |
158 lev=append(lev,got$lev[1]) | |
159 nseqs=append(nseqs,length(got[,1])) | |
160 } | |
161 res=data.frame(cbind("delta.rank"=rep(0),"pval"=pft,"level"=lev,nseqs,"term"=terms,"name"=nam)) | |
162 res[,1]=as.numeric(as.character(res[,1])) | |
163 res[,2]=as.numeric(as.character(res[,2])) | |
164 res[,3]=as.numeric(as.character(res[,3])) | |
165 res$nseqs=as.numeric(as.character(res$nseqs)) | |
166 return(res) | |
167 } | |
168 | |
169 #------------------------- | |
170 gomwuPlot=function(inFile,goAnnotations,goDivision,level1=0.1,level2=0.05,level3=0.01,absValue=-log(0.05,10),adjusted=TRUE,txtsize=1,font.family="sans",treeHeight=0.5,colors=NULL) { | |
171 require(ape) | |
172 | |
173 input=inFile | |
174 in.mwu=paste("MWU",goDivision,input,sep="_") | |
175 in.dissim=paste("dissim",goDivision,input,goAnnotations,sep="_") | |
176 | |
177 cutoff=-log(level1,10) | |
178 pv=read.table(in.mwu,header=T) | |
179 row.names(pv)=pv$term | |
180 in.raw=paste(goDivision,input,sep="_") | |
181 rsq=read.table(in.raw,sep="\t",header=T) | |
182 rsq$term=as.factor(rsq$term) | |
183 | |
184 if (adjusted==TRUE) { pvals=pv$p.adj } else { pvals=pv$pval } | |
185 heat=data.frame(cbind("pval"=pvals)) | |
186 row.names(heat)=pv$term | |
187 heat$pval=-log(heat$pval+1e-15,10) | |
188 heat$direction=0 | |
189 heat$direction[pv$delta.rank>0]=1 | |
190 if (cutoff>0) { | |
191 goods=subset(heat,pval>=cutoff) | |
192 } else { | |
193 goods.names=unique(rsq$term[abs(rsq$value)>=absValue]) | |
194 goods=heat[row.names(heat) %in% goods.names,] | |
195 } | |
196 | |
197 if (is.null(colors) | length(colors)<4 ) { | |
198 colors=c("dodgerblue2","firebrick1","skyblue2","lightcoral") | |
199 if (sum(goods$direction)==nrow(goods) | sum(goods$direction)==0) { | |
200 colors=c("black","black","grey50","grey50") | |
201 } | |
202 } | |
203 goods.names=row.names(goods) | |
204 | |
205 # reading and subsetting dissimilarity matrix | |
206 diss=read.table(in.dissim,sep="\t",header=T,check.names=F) | |
207 row.names(diss)=names(diss) | |
208 diss.goods=diss[goods.names,goods.names] | |
209 | |
210 # how many genes out of what we started with we account for with our best categories? | |
211 good.len=c();good.genes=c() | |
212 for (g in goods.names) { | |
213 sel=rsq[rsq$term==g,] | |
214 pass=abs(sel$value)>=absValue | |
215 sel=sel[pass,] | |
216 good.genes=append(good.genes,as.character(sel$seq)) | |
217 good.len=append(good.len,nrow(sel)) | |
218 } | |
219 ngenes=length(unique(good.genes)) | |
220 | |
221 #hist(rsq$value) | |
222 totSum=length(unique(rsq$seq[abs(rsq$value)>=absValue])) | |
223 row.names(goods)=paste(good.len,"/",pv[pv$term %in% goods.names,]$nseqs," ",pv[pv$term %in% goods.names,]$name,sep="") | |
224 row.names(heat)=paste(good.len,"/",pv$nseqs," ",pv$name,sep="") | |
225 row.names(diss.goods)=paste(good.len,"/",pv[pv$term %in% goods.names,]$nseqs," ",pv[pv$term %in% goods.names,]$name,sep="") | |
226 | |
227 # clustering terms better than cutoff | |
228 GO.categories=as.dist(diss.goods) | |
229 cl.goods=hclust(GO.categories,method="average") | |
230 labs=cl.goods$labels[cl.goods$order] # saving the labels to order the plot | |
231 goods=goods[labs,] | |
232 labs=sub(" activity","",labs) | |
233 | |
234 old.par <- par( no.readonly = TRUE ) | |
235 | |
236 plots=layout(matrix(c(1,2,3),1,3,byrow=T),c(treeHeight,3,1),TRUE) | |
237 | |
238 par(mar = c(2,2,0.85,0)) | |
239 plot(as.phylo(cl.goods),show.tip.label=FALSE,cex=0.0000001) | |
240 step=100 | |
241 left=1 | |
242 top=step*(2+length(labs)) | |
243 | |
244 par(mar = c(0,0,0.3,0)) | |
245 plot(c(1:top)~c(1:top),type="n",axes=F,xlab="",ylab="") | |
246 ii=1 | |
247 goods$color=1 | |
248 goods$color[goods$direction==1 & goods$pval>cutoff]=colors[4] | |
249 goods$color[goods$direction==0 & goods$pval>cutoff]=colors[3] | |
250 goods$color[goods$direction==1 & goods$pval>(-log(level2,10))]=colors[2] | |
251 goods$color[goods$direction==0 & goods$pval>(-log(level2,10))]=colors[1] | |
252 goods$color[goods$direction==1 & goods$pval>(-log(level3,10))]=colors[2] | |
253 goods$color[goods$direction==0 & goods$pval>(-log(level3,10))]=colors[1] | |
254 for (i in length(labs):1) { | |
255 ypos=top-step*ii | |
256 ii=ii+1 | |
257 if (goods$pval[i]> -log(level3,10)) { | |
258 text(left,ypos,labs[i],font=2,cex=1*txtsize,col=goods$color[i],adj=c(0,0),family=font.family) | |
259 } else { | |
260 if (goods$pval[i]>-log(level2,10)) { | |
261 text(left,ypos,labs[i],font=1,cex=0.8* txtsize,col=goods$color[i],adj=c(0,0),family=font.family) | |
262 } else { | |
263 # if (goods$pval[i]>cutoff) { | |
264 # text(left,ypos,labs[i],font=3,cex=0.8* txtsize,col=goods$color[i],adj=c(0,0),family=font.family) | |
265 # } else { | |
266 text(left,ypos,labs[i],font=3,cex=0.8* txtsize,col=goods$color[i],adj=c(0,0),family=font.family) | |
267 #} | |
268 } | |
269 } | |
270 } | |
271 | |
272 par(mar = c(3,1,1,0)) | |
273 | |
274 plot(c(1:top)~c(1:top),type="n",axes=F,xlab="",ylab="") | |
275 text(left,top-step*2,paste("p < ",level3,sep=""),font=2,cex=1* txtsize,adj=c(0,0),family=font.family) | |
276 text(left,top-step*3,paste("p < ",level2,sep=""),font=1,cex=0.8* txtsize,adj=c(0,0),family=font.family) | |
277 text(left,top-step*4,paste("p < ",10^(-cutoff),sep=""),font=3,col="grey50",cex=0.8* txtsize,adj=c(0,0),family=font.family) | |
278 | |
279 message("GO terms dispayed: ",length(goods.names)) | |
280 message("\"Good genes\" accounted for: ", ngenes," out of ",totSum, " ( ",round(100*ngenes/totSum,0), "% )") | |
281 par(old.par) | |
282 goods$pval=10^(-1*goods$pval) | |
283 return(list(goods,cl.goods)) | |
284 } | |
285 | |
286 #------------------ | |
287 # returns non-overlapping GO categories based on dissimilarity table | |
288 indepGO=function(dissim.table,min.similarity=1) { | |
289 tt=read.table(dissim.table,sep="\t", header=TRUE) | |
290 tt=as.matrix(tt) | |
291 diag(tt)=1 | |
292 for (i in 1:ncol(tt)) { | |
293 mins=apply(tt,2,min) | |
294 if (min(mins)>=min.similarity) { break } | |
295 sums=apply(tt,2,sum) | |
296 worst=which(sums==min(sums))[1] | |
297 # cat("\n",worsts,"\n") | |
298 # gw=c() | |
299 # for(w in worsts) { gw=append(gw,sum(tt[,w]==1)) } | |
300 # cat(gw) | |
301 # worst=worsts[gw==min(gw)] | |
302 # cat("\n",i," removing",worst,"; sum =",sums[worst]) | |
303 tt=tt[-worst,-worst] | |
304 mins=mins[-worst] | |
305 # cat(" new min =",min(mins)) | |
306 } | |
307 goods=colnames(tt) | |
308 goods=gsub("GO\\.","GO:",goods) | |
309 goods=gsub("\\.GO",";GO",goods) | |
310 } | |
311 |