Mercurial > repos > cristian > rbgoa
comparison gomwu.functions.R @ 2:5acf9dfdfa27 draft default tip
planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
author | cristian |
---|---|
date | Wed, 09 Nov 2022 08:57:54 +0000 |
parents | f7287f82602f |
children |
comparison
equal
deleted
inserted
replaced
1:f7287f82602f | 2:5acf9dfdfa27 |
---|---|
1 clusteringGOs=function(gen2go,div,cutHeight) { | 1 clusteringGOs <- function(gen2go, div, cutHeight) { |
2 nn <- strsplit(gen2go, "[/.]") | 2 nn <- strsplit(gen2go, "[/.]") |
3 if (length(nn[[1]]) == 3) { | 3 if (length(nn[[1]]) == 3) { |
4 dir <- nn[[1]][1] | 4 dir <- nn[[1]][1] |
5 name <- nn[[1]][2] | 5 name <- nn[[1]][2] |
6 ext <- nn[[1]][3] | 6 ext <- nn[[1]][3] |
7 } else if (length(nn[[1]]) == 2) { | 7 } else if (length(nn[[1]]) == 2) { |
8 dir <- "." | 8 dir <- "." |
9 name <- nn[[1]][1] | 9 name <- nn[[1]][1] |
10 ext <- nn[[1]][2] | 10 ext <- nn[[1]][2] |
11 } | 11 } |
12 inname=paste(dir,"/","dissim0_",div,"_",name,".",ext,sep="") | 12 inname <- paste(dir, "/", "dissim0_", div, "_", name, ".", ext, sep = "") |
13 | 13 |
14 outname=paste(dir,"/","cl_dissim0_",div,"_",name,".",ext,sep="") | 14 outname <- paste(dir, "/", "cl_dissim0_", div, "_", name, ".", ext, sep = "") |
15 if (!file.exists(outname)) { | 15 if (!file.exists(outname)) { |
16 diss=read.table(inname,sep="\t",header=T,check.names=F) | 16 diss <- read.table(inname, sep = "\t", header = T, check.names = F) |
17 row.names(diss)=names(diss) | 17 row.names(diss) <- names(diss) |
18 hc=hclust(as.dist(diss),method="complete") | 18 hc <- hclust(as.dist(diss), method = "complete") |
19 cc=cutree(hc,h=cutHeight) | 19 cc <- cutree(hc, h = cutHeight) |
20 write.csv(cc,file=outname,quote=F) | 20 write.csv(cc, file = outname, quote = F) |
21 } | 21 } |
22 } | 22 } |
23 | 23 |
24 | 24 |
25 | 25 |
26 #--------------- | 26 #--------------- |
27 gomwuStats=function(input,goDatabase,goAnnotations, goDivision, scriptdir, Module=FALSE, Alternative="t", adjust.multcomp="BH", clusterCutHeight=0.25,largest=0.1,smallest=5,perlPath="perl", shuffle.reps=20){ | 27 gomwuStats <- function(input, goDatabase, goAnnotations, goDivision, scriptdir, Module = FALSE, Alternative = "t", adjust.multcomp = "BH", clusterCutHeight = 0.25, largest = 0.1, smallest = 5, perlPath = "perl", shuffle.reps = 20) { |
28 | |
29 nn <- strsplit(input, "[/.]") | 28 nn <- strsplit(input, "[/.]") |
30 if (length(nn[[1]]) == 3) { | 29 if (length(nn[[1]]) == 3) { |
31 dir <- nn[[1]][1] | 30 dir <- nn[[1]][1] |
32 name <- nn[[1]][2] | 31 name <- nn[[1]][2] |
33 ext <- nn[[1]][3] | 32 ext <- nn[[1]][3] |
35 dir <- "." | 34 dir <- "." |
36 name <- nn[[1]][1] | 35 name <- nn[[1]][1] |
37 ext <- nn[[1]][2] | 36 ext <- nn[[1]][2] |
38 } | 37 } |
39 | 38 |
40 extraOptions=paste("largest=",largest," smallest=",smallest," cutHeight=",clusterCutHeight,sep="") | 39 extraOptions <- paste("largest=", largest, " smallest=", smallest, " cutHeight=", clusterCutHeight, sep = "") |
41 if (Module==TRUE) { adjust.multcomp="shuffle" } | 40 if (Module == TRUE) { |
42 gomwu_a = paste(scriptdir, "gomwu_a.pl", sep="/") | 41 adjust.multcomp <- "shuffle" |
43 gomwu_b = paste(scriptdir, "gomwu_b.pl", sep="/") | 42 } |
44 system(paste(perlPath, gomwu_a,goDatabase,goAnnotations,input,goDivision,extraOptions)) | 43 gomwu_a <- paste(scriptdir, "gomwu_a.pl", sep = "/") |
45 clusteringGOs(goAnnotations,goDivision,clusterCutHeight) | 44 gomwu_b <- paste(scriptdir, "gomwu_b.pl", sep = "/") |
46 system(paste(perlPath, gomwu_b,goAnnotations,input,goDivision)) | 45 system(paste(perlPath, gomwu_a, goDatabase, goAnnotations, input, goDivision, extraOptions)) |
47 | 46 clusteringGOs(goAnnotations, goDivision, clusterCutHeight) |
48 inname=paste(dir,"/",name,"_",goDivision,".tsv",sep="") | 47 system(paste(perlPath, gomwu_b, goAnnotations, input, goDivision)) |
49 rsq=read.table(inname,sep="\t",header=T) | 48 |
50 rsq$term=as.factor(rsq$term) | 49 inname <- paste(dir, "/", name, "_", goDivision, ".tsv", sep = "") |
51 | 50 rsq <- read.table(inname, sep = "\t", header = T) |
52 mwut.t=TRUE | 51 rsq$term <- as.factor(rsq$term) |
53 if (length(levels(as.factor(rsq$value)))==2) { | 52 |
54 cat("Binary classification detected; will perform Fisher's test\n"); | 53 mwut.t <- TRUE |
55 mwut.t=F | 54 if (length(levels(as.factor(rsq$value))) == 2) { |
56 rr=fisherTest(rsq) | 55 cat("Binary classification detected; will perform Fisher's test\n") |
57 } else { | 56 mwut.t <- F |
58 if (Module==TRUE) { | 57 rr <- fisherTest(rsq) |
59 rsq.f=rsq | 58 } else { |
60 rsq.f$value=as.numeric(rsq.f$value>0) | 59 if (Module == TRUE) { |
61 rf=fisherTest(rsq.f) | 60 rsq.f <- rsq |
62 rsq.m=rsq[rsq$value>0,] | 61 rsq.f$value <- as.numeric(rsq.f$value > 0) |
63 rsq.m$term=factor(rsq.m$term,levels=unique(rsq.m$term)) | 62 rf <- fisherTest(rsq.f) |
64 rrm=mwuTest(rsq.m,"g") | 63 rsq.m <- rsq[rsq$value > 0, ] |
65 rr0=rf[rf$term %in% rrm$term,] | 64 rsq.m$term <- factor(rsq.m$term, levels = unique(rsq.m$term)) |
66 rr1=rf[!(rf$term %in% rrm$term),] | 65 rrm <- mwuTest(rsq.m, "g") |
67 rr0=rr0[order(rr0$term),] | 66 rr0 <- rf[rf$term %in% rrm$term, ] |
68 rrm=rrm[order(rrm$term),] | 67 rr1 <- rf[!(rf$term %in% rrm$term), ] |
69 rr0$pval=rr0$pval*rrm$pval | 68 rr0 <- rr0[order(rr0$term), ] |
70 rr=rbind(rr0,rr1) | 69 rrm <- rrm[order(rrm$term), ] |
71 } else { | 70 rr0$pval <- rr0$pval * rrm$pval |
72 cat("Continuous measure of interest: will perform MWU test\n"); | 71 rr <- rbind(rr0, rr1) |
73 rr=mwuTest(rsq,Alternative) | 72 } else { |
74 } | 73 cat("Continuous measure of interest: will perform MWU test\n") |
75 } | 74 rr <- mwuTest(rsq, Alternative) |
76 | 75 } |
77 if (adjust.multcomp=="shuffle"){ | 76 } |
78 message("shuffling values to calculate FDR, ",shuffle.reps," reps") | 77 |
79 reps=shuffle.reps | 78 if (adjust.multcomp == "shuffle") { |
80 spv=c() | 79 message("shuffling values to calculate FDR, ", shuffle.reps, " reps") |
81 for (i in 1:reps) { | 80 reps <- shuffle.reps |
82 message("replicate ",i) | 81 spv <- c() |
83 rsqq=rsq | 82 for (i in 1:reps) { |
84 rsqq$value=sample(rsq$value) | 83 message("replicate ", i) |
85 if (Module==TRUE) { | 84 rsqq <- rsq |
86 rsq.f=rsqq | 85 rsqq$value <- sample(rsq$value) |
87 rsq.f$value=as.numeric(rsq.f$value>0) | 86 if (Module == TRUE) { |
88 rf=fisherTest(rsq.f) | 87 rsq.f <- rsqq |
89 rsq.m=rsqq[rsqq$value>0,] | 88 rsq.f$value <- as.numeric(rsq.f$value > 0) |
90 rsq.m$term=factor(rsq.m$term,levels=unique(rsq.m$term)) | 89 rf <- fisherTest(rsq.f) |
91 rrm=mwuTest(rsq.m,"g") | 90 rsq.m <- rsqq[rsqq$value > 0, ] |
92 rr0=rf[rf$term %in% rrm$term,] | 91 rsq.m$term <- factor(rsq.m$term, levels = unique(rsq.m$term)) |
93 rr1=rf[!(rf$term %in% rrm$term),] | 92 rrm <- mwuTest(rsq.m, "g") |
94 rr0=rr0[order(rr0$term),] | 93 rr0 <- rf[rf$term %in% rrm$term, ] |
95 rrm=rrm[order(rrm$term),] | 94 rr1 <- rf[!(rf$term %in% rrm$term), ] |
96 rr0$pval=rr0$pval*rrm$pval | 95 rr0 <- rr0[order(rr0$term), ] |
97 rs=rbind(rr0,rr1) | 96 rrm <- rrm[order(rrm$term), ] |
98 } else { | 97 rr0$pval <- rr0$pval * rrm$pval |
99 if (mwut.t==TRUE) { rs=mwuTest(rsqq,Alternative) } else { rs=fisherTest(rsqq) } | 98 rs <- rbind(rr0, rr1) |
100 } | 99 } else { |
101 spv=append(spv,rs$pval) | 100 if (mwut.t == TRUE) { |
102 } | 101 rs <- mwuTest(rsqq, Alternative) |
103 fdr=c() | 102 } else { |
104 for (p in rr$pval){ | 103 rs <- fisherTest(rsqq) |
105 fdr=append(fdr,(sum(spv<=p)/reps)/sum(rr$pval<=p)) | 104 } |
106 } | 105 } |
107 fdr[fdr>1]=1 | 106 spv <- append(spv, rs$pval) |
108 } else { | 107 } |
109 fdr=p.adjust(rr$pval,method=adjust.multcomp) | 108 fdr <- c() |
110 } | 109 for (p in rr$pval) { |
111 | 110 fdr <- append(fdr, (sum(spv <= p) / reps) / sum(rr$pval <= p)) |
112 message(sum(fdr<0.1)," GO terms at 10% FDR") | 111 } |
113 rr$p.adj=fdr | 112 fdr[fdr > 1] <- 1 |
114 fname=paste(dir,"/","MWU_",goDivision,"_",name,".",ext,sep="") | 113 } else { |
115 write.table(rr,fname,row.names=F) | 114 fdr <- p.adjust(rr$pval, method = adjust.multcomp) |
115 } | |
116 | |
117 message(sum(fdr < 0.1), " GO terms at 10% FDR") | |
118 rr$p.adj <- fdr | |
119 fname <- paste(dir, "/", "MWU_", goDivision, "_", name, ".", ext, sep = "") | |
120 write.table(rr, fname, sep = "\t", row.names = F) | |
116 } | 121 } |
117 | 122 |
118 #--------------------- | 123 #--------------------- |
119 mwuTest=function(gotable,Alternative) { | 124 mwuTest <- function(gotable, Alternative) { |
120 gos=gotable | 125 gos <- gotable |
121 terms=levels(gos$term) | 126 terms <- levels(gos$term) |
122 gos$seq=as.character(gos$seq) | 127 gos$seq <- as.character(gos$seq) |
123 nrg=gos[!duplicated(gos$seq),5] | 128 nrg <- gos[!duplicated(gos$seq), 5] |
124 names(nrg)=gos[!duplicated(gos$seq),4] | 129 names(nrg) <- gos[!duplicated(gos$seq), 4] |
125 # nrg=nrg+rnorm(nrg,sd=0.01) # to be able to do exact wilcox test | 130 # nrg=nrg+rnorm(nrg,sd=0.01) # to be able to do exact wilcox test |
126 rnk=rank(nrg) | 131 rnk <- rank(nrg) |
127 names(rnk)=names(nrg) | 132 names(rnk) <- names(nrg) |
128 pvals=c();drs=c();nams=c();levs=c();nseqs=c() | 133 pvals <- c() |
129 for (t in terms){ | 134 drs <- c() |
130 got=gos[gos$term==t,] | 135 nams <- c() |
131 got=got[!duplicated(got$seq),] | 136 levs <- c() |
132 ngot=gos[gos$term!=t,] | 137 nseqs <- c() |
133 ngot=ngot[!duplicated(ngot$seq),] | 138 for (t in terms) { |
134 ngot=ngot[!(ngot$seq %in% got$seq),] | 139 got <- gos[gos$term == t, ] |
135 sgo.yes=got$seq | 140 got <- got[!duplicated(got$seq), ] |
136 n1=length(sgo.yes) | 141 ngot <- gos[gos$term != t, ] |
137 sgo.no=ngot$seq | 142 ngot <- ngot[!duplicated(ngot$seq), ] |
138 n2=length(sgo.no) | 143 ngot <- ngot[!(ngot$seq %in% got$seq), ] |
139 #message(t," wilcox:",n1," ",n2) | 144 sgo.yes <- got$seq |
140 wi=wilcox.test(nrg[sgo.yes],nrg[sgo.no],alternative=Alternative) # removed correct=FALSE | 145 n1 <- length(sgo.yes) |
141 r1=sum(rnk[sgo.yes])/n1 | 146 sgo.no <- ngot$seq |
142 r0=sum(rnk[sgo.no])/n2 | 147 n2 <- length(sgo.no) |
143 dr=r1-r0 | 148 # message(t," wilcox:",n1," ",n2) |
144 drs=append(drs,round(dr,0)) | 149 wi <- wilcox.test(nrg[sgo.yes], nrg[sgo.no], alternative = Alternative) # removed correct=FALSE |
145 levs=append(levs,got$lev[1]) | 150 r1 <- sum(rnk[sgo.yes]) / n1 |
146 nams=append(nams,as.character(got$name[1])) | 151 r0 <- sum(rnk[sgo.no]) / n2 |
147 pvals=append(pvals,wi$p.value) | 152 dr <- r1 - r0 |
148 nseqs=append(nseqs,n1) | 153 drs <- append(drs, round(dr, 0)) |
149 } | 154 levs <- append(levs, got$lev[1]) |
150 res=data.frame(cbind("delta.rank"=drs,"pval"=pvals,"level"=levs,nseqs)) | 155 nams <- append(nams, as.character(got$name[1])) |
151 res=cbind(res,"term"=as.character(terms),"name"=nams) | 156 pvals <- append(pvals, wi$p.value) |
152 res$pval=as.numeric(as.character(res$pval)) | 157 nseqs <- append(nseqs, n1) |
153 res$delta.rank=as.numeric(as.character(res$delta.rank)) | 158 } |
154 res$level=as.numeric(as.character(res$level)) | 159 res <- data.frame(cbind("delta.rank" = drs, "pval" = pvals, "level" = levs, nseqs)) |
155 res$nseqs=as.numeric(as.character(res$nseqs)) | 160 res <- cbind(res, "term" = as.character(terms), "name" = nams) |
156 return(res) | 161 res$pval <- as.numeric(as.character(res$pval)) |
162 res$delta.rank <- as.numeric(as.character(res$delta.rank)) | |
163 res$level <- as.numeric(as.character(res$level)) | |
164 res$nseqs <- as.numeric(as.character(res$nseqs)) | |
165 return(res) | |
157 } | 166 } |
158 #------------------------ | 167 #------------------------ |
159 fisherTest=function(gotable) { | 168 fisherTest <- function(gotable) { |
160 gos=gotable | 169 gos <- gotable |
161 terms=levels(gos$term) | 170 terms <- levels(gos$term) |
162 gos$seq=as.character(gos$seq) | 171 gos$seq <- as.character(gos$seq) |
163 pft=c();nam=c();lev=c();nseqs=c() | 172 pft <- c() |
164 for (t in terms) { | 173 nam <- c() |
165 got=gos[gos$term==t,] | 174 lev <- c() |
166 got=got[!duplicated(got$seq),] | 175 nseqs <- c() |
167 ngot=gos[gos$term!=t,] | 176 for (t in terms) { |
168 ngot=ngot[!duplicated(ngot$seq),] | 177 got <- gos[gos$term == t, ] |
169 ngot=ngot[!(ngot$seq %in% got$seq),] | 178 got <- got[!duplicated(got$seq), ] |
170 go.sig=sum(got$value) | 179 ngot <- gos[gos$term != t, ] |
171 go.ns=length(got[,1])-go.sig | 180 ngot <- ngot[!duplicated(ngot$seq), ] |
172 ngo.sig=sum(ngot$value) | 181 ngot <- ngot[!(ngot$seq %in% got$seq), ] |
173 ngo.ns=length(ngot[,1])-ngo.sig | 182 go.sig <- sum(got$value) |
174 sig=c(go.sig,ngo.sig) # number of significant genes belonging and not belonging to the tested GO category | 183 go.ns <- length(got[, 1]) - go.sig |
175 ns=c(go.ns,ngo.ns) # number of not-significant genes belonging and not belonging to the tested GO category | 184 ngo.sig <- sum(ngot$value) |
176 mm=matrix(c(sig,ns),nrow=2,dimnames=list(ns=c("go","notgo"),sig=c("go","notgo"))) | 185 ngo.ns <- length(ngot[, 1]) - ngo.sig |
177 ff=fisher.test(mm,alternative="greater") | 186 sig <- c(go.sig, ngo.sig) # number of significant genes belonging and not belonging to the tested GO category |
178 pft=append(pft,ff$p.value) | 187 ns <- c(go.ns, ngo.ns) # number of not-significant genes belonging and not belonging to the tested GO category |
179 nam=append(nam,as.character(got$name[1])) | 188 mm <- matrix(c(sig, ns), nrow = 2, dimnames = list(ns = c("go", "notgo"), sig = c("go", "notgo"))) |
180 lev=append(lev,got$lev[1]) | 189 ff <- fisher.test(mm, alternative = "greater") |
181 nseqs=append(nseqs,length(got[,1])) | 190 pft <- append(pft, ff$p.value) |
182 } | 191 nam <- append(nam, as.character(got$name[1])) |
183 res=data.frame(cbind("delta.rank"=rep(0),"pval"=pft,"level"=lev,nseqs,"term"=terms,"name"=nam)) | 192 lev <- append(lev, got$lev[1]) |
184 res[,1]=as.numeric(as.character(res[,1])) | 193 nseqs <- append(nseqs, length(got[, 1])) |
185 res[,2]=as.numeric(as.character(res[,2])) | 194 } |
186 res[,3]=as.numeric(as.character(res[,3])) | 195 res <- data.frame(cbind("delta.rank" = rep(0), "pval" = pft, "level" = lev, nseqs, "term" = terms, "name" = nam)) |
187 res$nseqs=as.numeric(as.character(res$nseqs)) | 196 res[, 1] <- as.numeric(as.character(res[, 1])) |
188 return(res) | 197 res[, 2] <- as.numeric(as.character(res[, 2])) |
198 res[, 3] <- as.numeric(as.character(res[, 3])) | |
199 res$nseqs <- as.numeric(as.character(res$nseqs)) | |
200 return(res) | |
189 } | 201 } |
190 | 202 |
191 #------------------------- | 203 #------------------------- |
192 gomwuPlot=function(inFile,goAnnotations,goDivision,level1=0.1,level2=0.05,level3=0.01,absValue=-log(0.05,10),adjusted=TRUE,txtsize=1,font.family="sans",treeHeight=0.5,colors=NULL) { | 204 gomwuPlot <- function(inFile, goAnnotations, goDivision, level1 = 0.1, level2 = 0.05, level3 = 0.01, absValue = -log(0.05, 10), adjusted = TRUE, txtsize = 1, font.family = "sans", treeHeight = 0.5, colors = NULL) { |
193 require(ape) | 205 require(ape) |
194 | 206 |
195 nn <- strsplit(inFile, "[/.]") | 207 nn <- strsplit(inFile, "[/.]") |
196 if (length(nn[[1]]) == 3) { | 208 if (length(nn[[1]]) == 3) { |
197 dir <- nn[[1]][1] | 209 dir <- nn[[1]][1] |
198 name <- nn[[1]][2] | 210 name <- nn[[1]][2] |
199 ext <- nn[[1]][3] | 211 ext <- nn[[1]][3] |
210 } else if (length(aa[[1]]) == 2) { | 222 } else if (length(aa[[1]]) == 2) { |
211 adir <- "." | 223 adir <- "." |
212 aname <- aa[[1]][1] | 224 aname <- aa[[1]][1] |
213 aext <- aa[[1]][2] | 225 aext <- aa[[1]][2] |
214 } | 226 } |
215 # input=inFile | 227 # input=inFile |
216 in.mwu=paste(dir,"/",paste("MWU",goDivision,name,sep="_"), ".", ext,sep="") | 228 in.mwu <- paste(dir, "/", paste("MWU", goDivision, name, sep = "_"), ".", ext, sep = "") |
217 in.dissim=paste(dir, "/", paste("dissim",goDivision,name,aname,sep="_"), ".", aext, sep="") | 229 in.dissim <- paste(dir, "/", paste("dissim", goDivision, name, aname, sep = "_"), ".", aext, sep = "") |
218 | 230 |
219 cutoff=-log(level1,10) | 231 cutoff <- -log(level1, 10) |
220 pv=read.table(in.mwu,header=T) | 232 pv <- read.table(in.mwu, header = T) |
221 row.names(pv)=pv$term | 233 row.names(pv) <- pv$term |
222 in.raw=paste(dir,"/",paste(name,goDivision,sep="_"), ".tsv", sep="") | 234 in.raw <- paste(dir, "/", paste(name, goDivision, sep = "_"), ".tsv", sep = "") |
223 rsq=read.table(in.raw,sep="\t",header=T) | 235 rsq <- read.table(in.raw, sep = "\t", header = T) |
224 rsq$term=as.factor(rsq$term) | 236 rsq$term <- as.factor(rsq$term) |
225 | 237 |
226 if (adjusted==TRUE) { pvals=pv$p.adj } else { pvals=pv$pval } | 238 if (adjusted == TRUE) { |
227 heat=data.frame(cbind("pval"=pvals)) | 239 pvals <- pv$p.adj |
228 row.names(heat)=pv$term | 240 } else { |
229 heat$pval=-log(heat$pval+1e-15,10) | 241 pvals <- pv$pval |
230 heat$direction=0 | 242 } |
231 heat$direction[pv$delta.rank>0]=1 | 243 heat <- data.frame(cbind("pval" = pvals)) |
232 if (cutoff>0) { | 244 row.names(heat) <- pv$term |
233 goods=subset(heat,pval>=cutoff) | 245 heat$pval <- -log(heat$pval + 1e-15, 10) |
234 } else { | 246 heat$direction <- 0 |
235 goods.names=unique(rsq$term[abs(rsq$value)>=absValue]) | 247 heat$direction[pv$delta.rank > 0] <- 1 |
236 goods=heat[row.names(heat) %in% goods.names,] | 248 if (cutoff > 0) { |
237 } | 249 goods <- subset(heat, pval >= cutoff) |
238 | 250 } else { |
239 if (is.null(colors) | length(colors)<4 ) { | 251 goods.names <- unique(rsq$term[abs(rsq$value) >= absValue]) |
240 colors=c("dodgerblue2","firebrick1","skyblue2","lightcoral") | 252 goods <- heat[row.names(heat) %in% goods.names, ] |
241 if (sum(goods$direction)==nrow(goods) | sum(goods$direction)==0) { | 253 } |
242 colors=c("black","black","grey50","grey50") | 254 |
243 } | 255 if (is.null(colors) | length(colors) < 4) { |
244 } | 256 colors <- c("dodgerblue2", "firebrick1", "skyblue2", "lightcoral") |
245 goods.names=row.names(goods) | 257 if (sum(goods$direction) == nrow(goods) | sum(goods$direction) == 0) { |
246 | 258 colors <- c("black", "black", "grey50", "grey50") |
247 # reading and subsetting dissimilarity matrix | 259 } |
248 diss=read.table(in.dissim,sep="\t",header=T,check.names=F) | 260 } |
249 row.names(diss)=names(diss) | 261 goods.names <- row.names(goods) |
250 diss.goods=diss[goods.names,goods.names] | 262 |
251 | 263 # reading and subsetting dissimilarity matrix |
252 # how many genes out of what we started with we account for with our best categories? | 264 diss <- read.table(in.dissim, sep = "\t", header = T, check.names = F) |
253 good.len=c();good.genes=c() | 265 row.names(diss) <- names(diss) |
254 for (g in goods.names) { | 266 diss.goods <- diss[goods.names, goods.names] |
255 sel=rsq[rsq$term==g,] | 267 |
256 pass=abs(sel$value)>=absValue | 268 # how many genes out of what we started with we account for with our best categories? |
257 sel=sel[pass,] | 269 good.len <- c() |
258 good.genes=append(good.genes,as.character(sel$seq)) | 270 good.genes <- c() |
259 good.len=append(good.len,nrow(sel)) | 271 for (g in goods.names) { |
260 } | 272 sel <- rsq[rsq$term == g, ] |
261 ngenes=length(unique(good.genes)) | 273 pass <- abs(sel$value) >= absValue |
262 | 274 sel <- sel[pass, ] |
263 #hist(rsq$value) | 275 good.genes <- append(good.genes, as.character(sel$seq)) |
264 totSum=length(unique(rsq$seq[abs(rsq$value)>=absValue])) | 276 good.len <- append(good.len, nrow(sel)) |
265 row.names(goods)=paste(good.len,"/",pv[pv$term %in% goods.names,]$nseqs," ",pv[pv$term %in% goods.names,]$name,sep="") | 277 } |
266 row.names(heat)=paste(good.len,"/",pv$nseqs," ",pv$name,sep="") | 278 ngenes <- length(unique(good.genes)) |
267 row.names(diss.goods)=paste(good.len,"/",pv[pv$term %in% goods.names,]$nseqs," ",pv[pv$term %in% goods.names,]$name,sep="") | 279 |
268 | 280 # hist(rsq$value) |
269 # clustering terms better than cutoff | 281 totSum <- length(unique(rsq$seq[abs(rsq$value) >= absValue])) |
270 GO.categories=as.dist(diss.goods) | 282 row.names(goods) <- paste(good.len, "/", pv[pv$term %in% goods.names, ]$nseqs, " ", pv[pv$term %in% goods.names, ]$name, sep = "") |
271 cl.goods=hclust(GO.categories,method="average") | 283 row.names(heat) <- paste(good.len, "/", pv$nseqs, " ", pv$name, sep = "") |
272 labs=cl.goods$labels[cl.goods$order] # saving the labels to order the plot | 284 row.names(diss.goods) <- paste(good.len, "/", pv[pv$term %in% goods.names, ]$nseqs, " ", pv[pv$term %in% goods.names, ]$name, sep = "") |
273 goods=goods[labs,] | 285 |
274 labs=sub(" activity","",labs) | 286 # clustering terms better than cutoff |
275 | 287 GO.categories <- as.dist(diss.goods) |
276 old.par <- par( no.readonly = TRUE ) | 288 cl.goods <- hclust(GO.categories, method = "average") |
277 | 289 labs <- cl.goods$labels[cl.goods$order] # saving the labels to order the plot |
278 plots=layout(matrix(c(1,2,3),1,3,byrow=T),c(treeHeight,3,1),TRUE) | 290 goods <- goods[labs, ] |
279 | 291 labs <- sub(" activity", "", labs) |
280 par(mar = c(2,2,0.85,0)) | 292 |
281 plot(as.phylo(cl.goods),show.tip.label=FALSE,cex=0.0000001) | 293 old.par <- par(no.readonly = TRUE) |
282 step=100 | 294 |
283 left=1 | 295 plots <- layout(matrix(c(1, 2, 3), 1, 3, byrow = T), c(treeHeight, 3, 1), TRUE) |
284 top=step*(2+length(labs)) | 296 |
285 | 297 par(mar = c(2, 2, 0.85, 0)) |
286 par(mar = c(0,0,0.3,0)) | 298 plot(as.phylo(cl.goods), show.tip.label = FALSE, cex = 0.0000001) |
287 plot(c(1:top)~c(1:top),type="n",axes=F,xlab="",ylab="") | 299 |
288 ii=1 | 300 par(mar = c(2, 0, 1, 0)) |
289 goods$color=1 | 301 step <- dev.size("px")[2] / (length(goods.names) - 1) |
290 goods$color[goods$direction==1 & goods$pval>cutoff]=colors[4] | 302 left <- 1 |
291 goods$color[goods$direction==0 & goods$pval>cutoff]=colors[3] | 303 top <- step * (length(labs) - 1) |
292 goods$color[goods$direction==1 & goods$pval>(-log(level2,10))]=colors[2] | 304 # print(paste("Size:", dev.size("px"))) |
293 goods$color[goods$direction==0 & goods$pval>(-log(level2,10))]=colors[1] | 305 # print(paste("Good:", length(goods.names), "Step:", step, "Top:", top)) |
294 goods$color[goods$direction==1 & goods$pval>(-log(level3,10))]=colors[2] | 306 |
295 goods$color[goods$direction==0 & goods$pval>(-log(level3,10))]=colors[1] | 307 plot(c(1:top), c(1:top), type = "n", axes = F, xlab = "", ylab = "") |
296 for (i in length(labs):1) { | 308 ii <- 0 |
297 ypos=top-step*ii | 309 goods$color <- 1 |
298 ii=ii+1 | 310 goods$color[goods$direction == 1 & goods$pval > cutoff] <- colors[4] |
299 if (goods$pval[i]> -log(level3,10)) { | 311 goods$color[goods$direction == 0 & goods$pval > cutoff] <- colors[3] |
300 text(left,ypos,labs[i],font=2,cex=1*txtsize,col=goods$color[i],adj=c(0,0),family=font.family) | 312 goods$color[goods$direction == 1 & goods$pval > (-log(level2, 10))] <- colors[2] |
301 } else { | 313 goods$color[goods$direction == 0 & goods$pval > (-log(level2, 10))] <- colors[1] |
302 if (goods$pval[i]>-log(level2,10)) { | 314 goods$color[goods$direction == 1 & goods$pval > (-log(level3, 10))] <- colors[2] |
303 text(left,ypos,labs[i],font=1,cex=0.8* txtsize,col=goods$color[i],adj=c(0,0),family=font.family) | 315 goods$color[goods$direction == 0 & goods$pval > (-log(level3, 10))] <- colors[1] |
304 } else { | 316 for (i in length(labs):1) { |
305 # if (goods$pval[i]>cutoff) { | 317 ypos <- round(top - step * ii) |
306 # text(left,ypos,labs[i],font=3,cex=0.8* txtsize,col=goods$color[i],adj=c(0,0),family=font.family) | 318 # print(paste("Ypos:", ypos)) |
307 # } else { | 319 ii <- ii + 1 |
308 text(left,ypos,labs[i],font=3,cex=0.8* txtsize,col=goods$color[i],adj=c(0,0),family=font.family) | 320 if (goods$pval[i] > -log(level3, 10)) { |
309 #} | 321 text(left, ypos, labs[i], font = 2, cex = 1 * txtsize, col = goods$color[i], adj = c(0, 0), family = font.family) |
310 } | 322 } else { |
311 } | 323 if (goods$pval[i] > -log(level2, 10)) { |
312 } | 324 text(left, ypos, labs[i], font = 1, cex = 0.8 * txtsize, col = goods$color[i], adj = c(0, 0), family = font.family) |
313 | 325 } else { |
314 par(mar = c(3,1,1,0)) | 326 # if (goods$pval[i]>cutoff) { |
315 | 327 # text(left,ypos,labs[i],font=3,cex=0.8* txtsize,col=goods$color[i],adj=c(0,0),family=font.family) |
316 plot(c(1:top)~c(1:top),type="n",axes=F,xlab="",ylab="") | 328 # } else { |
317 text(left,top-step*2,paste("p < ",level3,sep=""),font=2,cex=1* txtsize,adj=c(0,0),family=font.family) | 329 text(left, ypos, labs[i], font = 3, cex = 0.8 * txtsize, col = goods$color[i], adj = c(0, 0), family = font.family) |
318 text(left,top-step*3,paste("p < ",level2,sep=""),font=1,cex=0.8* txtsize,adj=c(0,0),family=font.family) | 330 # } |
319 text(left,top-step*4,paste("p < ",10^(-cutoff),sep=""),font=3,col="grey50",cex=0.8* txtsize,adj=c(0,0),family=font.family) | 331 } |
320 | 332 } |
321 message("GO terms dispayed: ",length(goods.names)) | 333 } |
322 message("\"Good genes\" accounted for: ", ngenes," out of ",totSum, " ( ",round(100*ngenes/totSum,0), "% )") | 334 |
323 par(old.par) | 335 par(mar = c(3, 1, 1, 0)) |
324 goods$pval=10^(-1*goods$pval) | 336 |
325 return(list(goods,cl.goods)) | 337 plot(c(1:top), c(1:top), type = "n", axes = F, xlab = "", ylab = "") |
338 text(left, top - step * 2, paste("p < ", level3, sep = ""), font = 2, cex = 1 * txtsize, adj = c(0, 0), family = font.family) | |
339 text(left, top - step * 3, paste("p < ", level2, sep = ""), font = 1, cex = 0.8 * txtsize, adj = c(0, 0), family = font.family) | |
340 text(left, top - step * 4, paste("p < ", 10^(-cutoff), sep = ""), font = 3, col = "grey50", cex = 0.8 * txtsize, adj = c(0, 0), family = font.family) | |
341 | |
342 message("GO terms dispayed: ", length(goods.names)) | |
343 message("\"Good genes\" accounted for: ", ngenes, " out of ", totSum, " ( ", round(100 * ngenes / totSum, 0), "% )") | |
344 par(old.par) | |
345 goods$pval <- 10^(-1 * goods$pval) | |
346 return(list(goods, cl.goods)) | |
326 } | 347 } |
327 | 348 |
328 #------------------ | 349 #------------------ |
329 # returns non-overlapping GO categories based on dissimilarity table | 350 # returns non-overlapping GO categories based on dissimilarity table |
330 indepGO=function(dissim.table,min.similarity=1) { | 351 indepGO <- function(dissim.table, min.similarity = 1) { |
331 tt=read.table(dissim.table,sep="\t", header=TRUE) | 352 tt <- read.table(dissim.table, sep = "\t", header = TRUE) |
332 tt=as.matrix(tt) | 353 tt <- as.matrix(tt) |
333 diag(tt)=1 | 354 diag(tt) <- 1 |
334 for (i in 1:ncol(tt)) { | 355 for (i in 1:ncol(tt)) { |
335 mins=apply(tt,2,min) | 356 mins <- apply(tt, 2, min) |
336 if (min(mins)>=min.similarity) { break } | 357 if (min(mins) >= min.similarity) { |
337 sums=apply(tt,2,sum) | 358 break |
338 worst=which(sums==min(sums))[1] | 359 } |
339 # cat("\n",worsts,"\n") | 360 sums <- apply(tt, 2, sum) |
340 # gw=c() | 361 worst <- which(sums == min(sums))[1] |
341 # for(w in worsts) { gw=append(gw,sum(tt[,w]==1)) } | 362 # cat("\n",worsts,"\n") |
342 # cat(gw) | 363 # gw=c() |
343 # worst=worsts[gw==min(gw)] | 364 # for(w in worsts) { gw=append(gw,sum(tt[,w]==1)) } |
344 # cat("\n",i," removing",worst,"; sum =",sums[worst]) | 365 # cat(gw) |
345 tt=tt[-worst,-worst] | 366 # worst=worsts[gw==min(gw)] |
346 mins=mins[-worst] | 367 # cat("\n",i," removing",worst,"; sum =",sums[worst]) |
347 # cat(" new min =",min(mins)) | 368 tt <- tt[-worst, -worst] |
348 } | 369 mins <- mins[-worst] |
349 goods=colnames(tt) | 370 # cat(" new min =",min(mins)) |
350 goods=gsub("GO\\.","GO:",goods) | 371 } |
351 goods=gsub("\\.GO",";GO",goods) | 372 goods <- colnames(tt) |
352 } | 373 goods <- gsub("GO\\.", "GO:", goods) |
353 | 374 goods <- gsub("\\.GO", ";GO", goods) |
375 } |