diff gomwu.functions.R @ 0:91261b42c07e draft

"planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
author cristian
date Thu, 14 Apr 2022 13:28:05 +0000
parents
children f7287f82602f
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gomwu.functions.R	Thu Apr 14 13:28:05 2022 +0000
@@ -0,0 +1,311 @@
+clusteringGOs=function(gen2go,div,cutHeight) {
+	inname=paste("dissim0_",div,"_",gen2go,sep="")
+	outname=paste("cl_",inname,sep="")
+	if (!file.exists(outname)) {
+		diss=read.table(inname,sep="\t",header=T,check.names=F)
+		row.names(diss)=names(diss)
+		hc=hclust(as.dist(diss),method="complete")
+		cc=cutree(hc,h=cutHeight)
+		write.csv(cc,file=outname,quote=F)
+	}
+}
+
+
+
+#---------------
+gomwuStats=function(input,goDatabase,goAnnotations, goDivision, scriptdir, Module=FALSE, Alternative="t", adjust.multcomp="BH", clusterCutHeight=0.25,largest=0.1,smallest=5,perlPath="perl", shuffle.reps=20){
+
+	extraOptions=paste("largest=",largest," smallest=",smallest," cutHeight=",clusterCutHeight,sep="")
+	if (Module==TRUE) { adjust.multcomp="shuffle" }
+    gomwu_a = paste(scriptdir, "gomwu_a.pl", sep="/")
+    gomwu_b = paste(scriptdir, "gomwu_b.pl", sep="/")
+	system(paste(perlPath, gomwu_a,goDatabase,goAnnotations,input,goDivision,extraOptions))
+	clusteringGOs(goAnnotations,goDivision,clusterCutHeight)
+	system(paste(perlPath, gomwu_b,goAnnotations,input,goDivision))
+
+	inname=paste(goDivision,"_",input,sep="")	
+	rsq=read.table(inname,sep="\t",header=T)
+	rsq$term=as.factor(rsq$term)
+
+	mwut.t=TRUE
+	if (length(levels(as.factor(rsq$value)))==2) {
+		cat("Binary classification detected; will perform Fisher's test\n");
+		mwut.t=F
+		rr=fisherTest(rsq)
+	} else {
+		if (Module==TRUE) {
+			rsq.f=rsq
+			rsq.f$value=as.numeric(rsq.f$value>0)
+			rf=fisherTest(rsq.f)
+			rsq.m=rsq[rsq$value>0,]
+			rsq.m$term=factor(rsq.m$term,levels=unique(rsq.m$term))
+			rrm=mwuTest(rsq.m,"g")
+			rr0=rf[rf$term %in% rrm$term,]
+			rr1=rf[!(rf$term %in% rrm$term),]
+			rr0=rr0[order(rr0$term),]
+			rrm=rrm[order(rrm$term),]
+			rr0$pval=rr0$pval*rrm$pval
+			rr=rbind(rr0,rr1)
+		} else {
+			cat("Continuous measure of interest: will perform MWU test\n");		
+			rr=mwuTest(rsq,Alternative)
+		  }
+	}
+	
+	if (adjust.multcomp=="shuffle"){
+	 message("shuffling values to calculate FDR, ",shuffle.reps," reps")
+	 reps=shuffle.reps
+	 spv=c()
+	 for (i in 1:reps) {
+		 message("replicate ",i)
+		 rsqq=rsq
+		 rsqq$value=sample(rsq$value)
+		 if (Module==TRUE) {
+			 rsq.f=rsqq
+			 rsq.f$value=as.numeric(rsq.f$value>0)
+			 rf=fisherTest(rsq.f)
+			 rsq.m=rsqq[rsqq$value>0,]
+			 rsq.m$term=factor(rsq.m$term,levels=unique(rsq.m$term))
+			 rrm=mwuTest(rsq.m,"g")
+			 rr0=rf[rf$term %in% rrm$term,]
+			 rr1=rf[!(rf$term %in% rrm$term),]
+			 rr0=rr0[order(rr0$term),]
+			 rrm=rrm[order(rrm$term),]
+			 rr0$pval=rr0$pval*rrm$pval
+			 rs=rbind(rr0,rr1)
+		} else {
+		 if (mwut.t==TRUE) { rs=mwuTest(rsqq,Alternative) } else { rs=fisherTest(rsqq) }
+		}
+		 spv=append(spv,rs$pval)
+	 }
+	 fdr=c()
+	 for (p in rr$pval){
+		 fdr=append(fdr,(sum(spv<=p)/reps)/sum(rr$pval<=p))
+	 }
+	 fdr[fdr>1]=1
+	} else {
+	  fdr=p.adjust(rr$pval,method=adjust.multcomp)
+	}
+	 
+	message(sum(fdr<0.1)," GO terms at 10% FDR")
+	rr$p.adj=fdr
+	fname=paste("MWU_",inname,sep="")
+	write.table(rr,fname,row.names=F)
+}
+
+#---------------------
+mwuTest=function(gotable,Alternative) {
+	gos=gotable
+	terms=levels(gos$term)
+	gos$seq=as.character(gos$seq)
+	nrg=gos[!duplicated(gos$seq),5]
+	names(nrg)=gos[!duplicated(gos$seq),4]
+#	nrg=nrg+rnorm(nrg,sd=0.01) # to be able to do exact wilcox test
+	rnk=rank(nrg)
+	names(rnk)=names(nrg)
+	pvals=c();drs=c();nams=c();levs=c();nseqs=c()
+	for (t in terms){
+		got=gos[gos$term==t,]
+		got=got[!duplicated(got$seq),]
+		ngot=gos[gos$term!=t,]
+		ngot=ngot[!duplicated(ngot$seq),]
+		ngot=ngot[!(ngot$seq %in% got$seq),]
+		sgo.yes=got$seq
+		n1=length(sgo.yes)
+		sgo.no=ngot$seq
+		n2=length(sgo.no)
+#message(t," wilcox:",n1," ",n2)
+		wi=wilcox.test(nrg[sgo.yes],nrg[sgo.no],alternative=Alternative)	# removed correct=FALSE 
+		r1=sum(rnk[sgo.yes])/n1
+		r0=sum(rnk[sgo.no])/n2
+		dr=r1-r0
+		drs=append(drs,round(dr,0))
+		levs=append(levs,got$lev[1])
+		nams=append(nams,as.character(got$name[1]))
+		pvals=append(pvals,wi$p.value)
+		nseqs=append(nseqs,n1)	
+	}
+	res=data.frame(cbind("delta.rank"=drs,"pval"=pvals,"level"=levs,nseqs))
+	res=cbind(res,"term"=as.character(terms),"name"=nams)
+	res$pval=as.numeric(as.character(res$pval))
+	res$delta.rank=as.numeric(as.character(res$delta.rank))
+	res$level=as.numeric(as.character(res$level))
+	res$nseqs=as.numeric(as.character(res$nseqs))
+	return(res)
+}
+#------------------------
+fisherTest=function(gotable) {
+	gos=gotable
+	terms=levels(gos$term)
+	gos$seq=as.character(gos$seq)
+	pft=c();nam=c();lev=c();nseqs=c()
+	for (t in terms) {
+		got=gos[gos$term==t,]
+		got=got[!duplicated(got$seq),]
+		ngot=gos[gos$term!=t,]
+		ngot=ngot[!duplicated(ngot$seq),]
+		ngot=ngot[!(ngot$seq %in% got$seq),]
+		go.sig=sum(got$value)
+		go.ns=length(got[,1])-go.sig
+		ngo.sig=sum(ngot$value)
+		ngo.ns=length(ngot[,1])-ngo.sig
+		sig=c(go.sig,ngo.sig) # number of significant genes belonging and not belonging to the tested GO category
+		ns=c(go.ns,ngo.ns) # number of not-significant genes belonging and not belonging to the tested GO category
+		mm=matrix(c(sig,ns),nrow=2,dimnames=list(ns=c("go","notgo"),sig=c("go","notgo")))
+		ff=fisher.test(mm,alternative="greater")
+		pft=append(pft,ff$p.value)
+		nam=append(nam,as.character(got$name[1]))
+		lev=append(lev,got$lev[1])
+		nseqs=append(nseqs,length(got[,1]))
+	}
+	res=data.frame(cbind("delta.rank"=rep(0),"pval"=pft,"level"=lev,nseqs,"term"=terms,"name"=nam))
+	res[,1]=as.numeric(as.character(res[,1]))
+	res[,2]=as.numeric(as.character(res[,2]))
+	res[,3]=as.numeric(as.character(res[,3]))
+	res$nseqs=as.numeric(as.character(res$nseqs))
+	return(res)
+}
+
+#-------------------------
+gomwuPlot=function(inFile,goAnnotations,goDivision,level1=0.1,level2=0.05,level3=0.01,absValue=-log(0.05,10),adjusted=TRUE,txtsize=1,font.family="sans",treeHeight=0.5,colors=NULL) {
+	require(ape)
+	
+	input=inFile
+	in.mwu=paste("MWU",goDivision,input,sep="_")
+	in.dissim=paste("dissim",goDivision,input,goAnnotations,sep="_")
+	
+	cutoff=-log(level1,10)
+	pv=read.table(in.mwu,header=T)
+	row.names(pv)=pv$term
+	in.raw=paste(goDivision,input,sep="_")
+	rsq=read.table(in.raw,sep="\t",header=T)
+	rsq$term=as.factor(rsq$term)
+	
+	if (adjusted==TRUE) { pvals=pv$p.adj } else { pvals=pv$pval }
+	heat=data.frame(cbind("pval"=pvals)) 
+	row.names(heat)=pv$term
+	heat$pval=-log(heat$pval+1e-15,10)
+	heat$direction=0
+	heat$direction[pv$delta.rank>0]=1
+	if (cutoff>0) { 
+		goods=subset(heat,pval>=cutoff) 
+	} else {
+		goods.names=unique(rsq$term[abs(rsq$value)>=absValue])
+		goods=heat[row.names(heat) %in% goods.names,]
+	}
+	
+	if (is.null(colors) | length(colors)<4 ) {
+		colors=c("dodgerblue2","firebrick1","skyblue2","lightcoral")
+		if (sum(goods$direction)==nrow(goods) | sum(goods$direction)==0) { 
+			colors=c("black","black","grey50","grey50")
+		}
+	}
+	goods.names=row.names(goods)
+	
+	# reading and subsetting dissimilarity matrix
+	diss=read.table(in.dissim,sep="\t",header=T,check.names=F)
+	row.names(diss)=names(diss)
+	diss.goods=diss[goods.names,goods.names]
+	
+	# how many genes out of what we started with we account for with our best categories?
+	good.len=c();good.genes=c()
+	for (g in goods.names) {
+		sel=rsq[rsq$term==g,]	
+		pass=abs(sel$value)>=absValue
+		sel=sel[pass,]
+		good.genes=append(good.genes,as.character(sel$seq))
+		good.len=append(good.len,nrow(sel))
+	}
+	ngenes=length(unique(good.genes))
+	
+	#hist(rsq$value)
+	totSum=length(unique(rsq$seq[abs(rsq$value)>=absValue]))
+	row.names(goods)=paste(good.len,"/",pv[pv$term %in% goods.names,]$nseqs," ",pv[pv$term %in% goods.names,]$name,sep="")
+	row.names(heat)=paste(good.len,"/",pv$nseqs," ",pv$name,sep="")
+	row.names(diss.goods)=paste(good.len,"/",pv[pv$term %in% goods.names,]$nseqs," ",pv[pv$term %in% goods.names,]$name,sep="")
+	
+	# clustering terms better than cutoff
+	GO.categories=as.dist(diss.goods)
+	cl.goods=hclust(GO.categories,method="average")
+	labs=cl.goods$labels[cl.goods$order] # saving the labels to order the plot
+	goods=goods[labs,]
+	labs=sub(" activity","",labs)
+
+	old.par <- par( no.readonly = TRUE )
+
+	plots=layout(matrix(c(1,2,3),1,3,byrow=T),c(treeHeight,3,1),TRUE)
+
+    par(mar = c(2,2,0.85,0))
+	plot(as.phylo(cl.goods),show.tip.label=FALSE,cex=0.0000001)
+	step=100
+	left=1
+	top=step*(2+length(labs))
+
+    par(mar = c(0,0,0.3,0))
+	plot(c(1:top)~c(1:top),type="n",axes=F,xlab="",ylab="")
+	ii=1
+	goods$color=1
+	goods$color[goods$direction==1 & goods$pval>cutoff]=colors[4]
+	goods$color[goods$direction==0 & goods$pval>cutoff]=colors[3]
+	goods$color[goods$direction==1 & goods$pval>(-log(level2,10))]=colors[2]
+	goods$color[goods$direction==0 & goods$pval>(-log(level2,10))]=colors[1]
+	goods$color[goods$direction==1 & goods$pval>(-log(level3,10))]=colors[2]
+	goods$color[goods$direction==0 & goods$pval>(-log(level3,10))]=colors[1]
+	for (i in length(labs):1) {
+		ypos=top-step*ii
+		ii=ii+1
+		if (goods$pval[i]> -log(level3,10)) { 
+			text(left,ypos,labs[i],font=2,cex=1*txtsize,col=goods$color[i],adj=c(0,0),family=font.family) 
+		} else {
+			if (goods$pval[i]>-log(level2,10)) { 
+				text(left,ypos,labs[i],font=1,cex=0.8* txtsize,col=goods$color[i],adj=c(0,0),family=font.family)
+			} else {
+	#			if (goods$pval[i]>cutoff) { 
+	#				text(left,ypos,labs[i],font=3,cex=0.8* txtsize,col=goods$color[i],adj=c(0,0),family=font.family)
+		#		} else { 
+			text(left,ypos,labs[i],font=3,cex=0.8* txtsize,col=goods$color[i],adj=c(0,0),family=font.family) 
+			#}
+			}
+		}
+	}
+	
+    par(mar = c(3,1,1,0))
+	
+	plot(c(1:top)~c(1:top),type="n",axes=F,xlab="",ylab="")
+	text(left,top-step*2,paste("p < ",level3,sep=""),font=2,cex=1* txtsize,adj=c(0,0),family=font.family)
+	text(left,top-step*3,paste("p < ",level2,sep=""),font=1,cex=0.8* txtsize,adj=c(0,0),family=font.family)
+	text(left,top-step*4,paste("p < ",10^(-cutoff),sep=""),font=3,col="grey50",cex=0.8* txtsize,adj=c(0,0),family=font.family)
+	
+	message("GO terms dispayed: ",length(goods.names))
+	message("\"Good genes\" accounted for:  ", ngenes," out of ",totSum, " ( ",round(100*ngenes/totSum,0), "% )")
+	par(old.par)	
+	goods$pval=10^(-1*goods$pval)
+	return(list(goods,cl.goods))
+}
+
+#------------------
+# returns non-overlapping GO categories based on dissimilarity table
+indepGO=function(dissim.table,min.similarity=1) {
+	tt=read.table(dissim.table,sep="\t", header=TRUE)
+	tt=as.matrix(tt)
+	diag(tt)=1
+	for (i in 1:ncol(tt)) {	
+		mins=apply(tt,2,min)
+		if (min(mins)>=min.similarity) { break }
+		sums=apply(tt,2,sum)
+		worst=which(sums==min(sums))[1]
+		# cat("\n",worsts,"\n")
+		# gw=c()
+		# for(w in worsts) { gw=append(gw,sum(tt[,w]==1)) }
+		# cat(gw)
+		# worst=worsts[gw==min(gw)]
+#		cat("\n",i," removing",worst,"; sum =",sums[worst])
+		tt=tt[-worst,-worst]		
+		mins=mins[-worst]		
+#		cat(" new min =",min(mins))
+	}
+	goods=colnames(tt)
+	goods=gsub("GO\\.","GO:",goods)
+	goods=gsub("\\.GO",";GO",goods)
+}
+