Mercurial > repos > iuc > raceid_main
comparison scripts/RaceID_class.R @ 0:e01c989c7543 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/raceid commit 39918bfdb08f06862ca395ce58a6f5e4f6dd1a5e
author | iuc |
---|---|
date | Sat, 03 Mar 2018 17:34:16 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e01c989c7543 |
---|---|
1 ## load required packages. | |
2 require(tsne) | |
3 require(pheatmap) | |
4 require(MASS) | |
5 require(cluster) | |
6 require(mclust) | |
7 require(flexmix) | |
8 require(lattice) | |
9 require(fpc) | |
10 require(amap) | |
11 require(RColorBrewer) | |
12 require(locfit) | |
13 | |
14 ## class definition | |
15 SCseq <- setClass("SCseq", slots = c(expdata = "data.frame", ndata = "data.frame", fdata = "data.frame", distances = "matrix", tsne = "data.frame", kmeans = "list", background = "list", out = "list", cpart = "vector", fcol = "vector", filterpar = "list", clusterpar = "list", outlierpar ="list" )) | |
16 | |
17 setValidity("SCseq", | |
18 function(object) { | |
19 msg <- NULL | |
20 if ( ! is.data.frame(object@expdata) ){ | |
21 msg <- c(msg, "input data must be data.frame") | |
22 }else if ( nrow(object@expdata) < 2 ){ | |
23 msg <- c(msg, "input data must have more than one row") | |
24 }else if ( ncol(object@expdata) < 2 ){ | |
25 msg <- c(msg, "input data must have more than one column") | |
26 }else if (sum( apply( is.na(object@expdata),1,sum ) ) > 0 ){ | |
27 msg <- c(msg, "NAs are not allowed in input data") | |
28 }else if (sum( apply( object@expdata,1,min ) ) < 0 ){ | |
29 msg <- c(msg, "negative values are not allowed in input data") | |
30 } | |
31 if (is.null(msg)) TRUE | |
32 else msg | |
33 } | |
34 ) | |
35 | |
36 setMethod("initialize", | |
37 signature = "SCseq", | |
38 definition = function(.Object, expdata ){ | |
39 .Object@expdata <- expdata | |
40 .Object@ndata <- expdata | |
41 .Object@fdata <- expdata | |
42 validObject(.Object) | |
43 return(.Object) | |
44 } | |
45 ) | |
46 | |
47 setGeneric("filterdata", function(object, mintotal=3000, minexpr=5, minnumber=1, maxexpr=500, downsample=FALSE, dsn=1, rseed=17000) standardGeneric("filterdata")) | |
48 | |
49 setMethod("filterdata", | |
50 signature = "SCseq", | |
51 definition = function(object,mintotal,minexpr,minnumber,maxexpr,downsample,dsn,rseed) { | |
52 if ( ! is.numeric(mintotal) ) stop( "mintotal has to be a positive number" ) else if ( mintotal <= 0 ) stop( "mintotal has to be a positive number" ) | |
53 if ( ! is.numeric(minexpr) ) stop( "minexpr has to be a non-negative number" ) else if ( minexpr < 0 ) stop( "minexpr has to be a non-negative number" ) | |
54 if ( ! is.numeric(minnumber) ) stop( "minnumber has to be a non-negative integer number" ) else if ( round(minnumber) != minnumber | minnumber < 0 ) stop( "minnumber has to be a non-negative integer number" ) | |
55 if ( ! ( is.numeric(downsample) | is.logical(downsample) ) ) stop( "downsample has to be logical (TRUE/FALSE)" ) | |
56 if ( ! is.numeric(dsn) ) stop( "dsn has to be a positive integer number" ) else if ( round(dsn) != dsn | dsn <= 0 ) stop( "dsn has to be a positive integer number" ) | |
57 object@filterpar <- list(mintotal=mintotal, minexpr=minexpr, minnumber=minnumber, maxexpr=maxexpr, downsample=downsample, dsn=dsn) | |
58 object@ndata <- object@expdata[,apply(object@expdata,2,sum,na.rm=TRUE) >= mintotal] | |
59 if ( downsample ){ | |
60 set.seed(rseed) | |
61 object@ndata <- downsample(object@expdata,n=mintotal,dsn=dsn) | |
62 }else{ | |
63 x <- object@ndata | |
64 object@ndata <- as.data.frame( t(t(x)/apply(x,2,sum))*median(apply(x,2,sum,na.rm=TRUE)) + .1 ) | |
65 } | |
66 x <- object@ndata | |
67 object@fdata <- x[apply(x>=minexpr,1,sum,na.rm=TRUE) >= minnumber,] | |
68 x <- object@fdata | |
69 object@fdata <- x[apply(x,1,max,na.rm=TRUE) < maxexpr,] | |
70 return(object) | |
71 } | |
72 ) | |
73 | |
74 | |
75 downsample <- function(x,n,dsn){ | |
76 x <- round( x[,apply(x,2,sum,na.rm=TRUE) >= n], 0) | |
77 nn <- min( apply(x,2,sum) ) | |
78 for ( j in 1:dsn ){ | |
79 z <- data.frame(GENEID=rownames(x)) | |
80 rownames(z) <- rownames(x) | |
81 initv <- rep(0,nrow(z)) | |
82 for ( i in 1:dim(x)[2] ){ | |
83 y <- aggregate(rep(1,nn),list(sample(rep(rownames(x),x[,i]),nn)),sum) | |
84 na <- names(x)[i] | |
85 names(y) <- c("GENEID",na) | |
86 rownames(y) <- y$GENEID | |
87 z[,na] <- initv | |
88 k <- intersect(rownames(z),y$GENEID) | |
89 z[k,na] <- y[k,na] | |
90 z[is.na(z[,na]),na] <- 0 | |
91 } | |
92 rownames(z) <- as.vector(z$GENEID) | |
93 ds <- if ( j == 1 ) z[,-1] else ds + z[,-1] | |
94 } | |
95 ds <- ds/dsn + .1 | |
96 return(ds) | |
97 } | |
98 | |
99 dist.gen <- function(x,method="euclidean", ...) if ( method %in% c("spearman","pearson","kendall") ) as.dist( 1 - cor(t(x),method=method,...) ) else dist(x,method=method,...) | |
100 | |
101 dist.gen.pairs <- function(x,y,...) dist.gen(t(cbind(x,y)),...) | |
102 | |
103 clustfun <- function(x,clustnr=20,bootnr=50,metric="pearson",do.gap=TRUE,SE.method="Tibs2001SEmax",SE.factor=.25,B.gap=50,cln=0,rseed=17000) | |
104 { | |
105 if ( clustnr < 2) stop("Choose clustnr > 1") | |
106 di <- dist.gen(t(x),method=metric) | |
107 if ( do.gap | cln > 0 ){ | |
108 gpr <- NULL | |
109 if ( do.gap ){ | |
110 set.seed(rseed) | |
111 gpr <- clusGap(as.matrix(di), FUN = kmeans, K.max = clustnr, B = B.gap) | |
112 if ( cln == 0 ) cln <- maxSE(gpr$Tab[,3],gpr$Tab[,4],method=SE.method,SE.factor) | |
113 } | |
114 if ( cln <= 1 ) { | |
115 clb <- list(result=list(partition=rep(1,dim(x)[2])),bootmean=1) | |
116 names(clb$result$partition) <- names(x) | |
117 return(list(x=x,clb=clb,gpr=gpr,di=di)) | |
118 } | |
119 clb <- clusterboot(di,B=bootnr,distances=FALSE,bootmethod="boot",clustermethod=KmeansCBI,krange=cln,scaling=FALSE,multipleboot=FALSE,bscompare=TRUE,seed=rseed) | |
120 return(list(x=x,clb=clb,gpr=gpr,di=di)) | |
121 } | |
122 } | |
123 | |
124 KmeansCBI <- function (data, krange, k = NULL, scaling = FALSE, runs = 1, | |
125 criterion = "ch", method="euclidean",...) | |
126 { | |
127 if (!is.null(k)) | |
128 krange <- k | |
129 if (!identical(scaling, FALSE)) | |
130 sdata <- scale(data, center = TRUE, scale = scaling) | |
131 else sdata <- data | |
132 c1 <- Kmeansruns(sdata, krange, runs = runs, criterion = criterion, method = method, | |
133 ...) | |
134 partition <- c1$cluster | |
135 cl <- list() | |
136 nc <- krange | |
137 for (i in 1:nc) cl[[i]] <- partition == i | |
138 out <- list(result = c1, nc = nc, clusterlist = cl, partition = partition, | |
139 clustermethod = "kmeans") | |
140 out | |
141 } | |
142 | |
143 Kmeansruns <- function (data, krange = 2:10, criterion = "ch", iter.max = 100, | |
144 runs = 100, scaledata = FALSE, alpha = 0.001, critout = FALSE, | |
145 plot = FALSE, method="euclidean", ...) | |
146 { | |
147 data <- as.matrix(data) | |
148 if (criterion == "asw") | |
149 sdata <- dist(data) | |
150 if (scaledata) | |
151 data <- scale(data) | |
152 cluster1 <- 1 %in% krange | |
153 crit <- numeric(max(krange)) | |
154 km <- list() | |
155 for (k in krange) { | |
156 if (k > 1) { | |
157 minSS <- Inf | |
158 kmopt <- NULL | |
159 for (i in 1:runs) { | |
160 options(show.error.messages = FALSE) | |
161 repeat { | |
162 kmm <- try(Kmeans(data, k, iter.max = iter.max, method=method, | |
163 ...)) | |
164 if (class(kmm) != "try-error") | |
165 break | |
166 } | |
167 options(show.error.messages = TRUE) | |
168 swss <- sum(kmm$withinss) | |
169 if (swss < minSS) { | |
170 kmopt <- kmm | |
171 minSS <- swss | |
172 } | |
173 if (plot) { | |
174 par(ask = TRUE) | |
175 pairs(data, col = kmm$cluster, main = swss) | |
176 } | |
177 } | |
178 km[[k]] <- kmopt | |
179 crit[k] <- switch(criterion, asw = cluster.stats(sdata, | |
180 km[[k]]$cluster)$avg.silwidth, ch = calinhara(data, | |
181 km[[k]]$cluster)) | |
182 if (critout) | |
183 cat(k, " clusters ", crit[k], "\n") | |
184 } | |
185 } | |
186 if (cluster1) | |
187 cluster1 <- dudahart2(data, km[[2]]$cluster, alpha = alpha)$cluster1 | |
188 k.best <- which.max(crit) | |
189 if (cluster1) | |
190 k.best <- 1 | |
191 km[[k.best]]$crit <- crit | |
192 km[[k.best]]$bestk <- k.best | |
193 out <- km[[k.best]] | |
194 out | |
195 } | |
196 | |
197 binompval <- function(p,N,n){ | |
198 pval <- pbinom(n,round(N,0),p,lower.tail=TRUE) | |
199 pval[!is.na(pval) & pval > 0.5] <- 1-pval[!is.na(pval) & pval > 0.5] | |
200 return(pval) | |
201 } | |
202 | |
203 setGeneric("clustexp", function(object,clustnr=20,bootnr=50,metric="pearson",do.gap=TRUE,SE.method="Tibs2001SEmax",SE.factor=.25,B.gap=50,cln=0,rseed=17000) standardGeneric("clustexp")) | |
204 | |
205 setMethod("clustexp", | |
206 signature = "SCseq", | |
207 definition = function(object,clustnr,bootnr,metric,do.gap,SE.method,SE.factor,B.gap,cln,rseed) { | |
208 if ( ! is.numeric(clustnr) ) stop("clustnr has to be a positive integer") else if ( round(clustnr) != clustnr | clustnr <= 0 ) stop("clustnr has to be a positive integer") | |
209 if ( ! is.numeric(bootnr) ) stop("bootnr has to be a positive integer") else if ( round(bootnr) != bootnr | bootnr <= 0 ) stop("bootnr has to be a positive integer") | |
210 if ( ! ( metric %in% c( "spearman","pearson","kendall","euclidean","maximum","manhattan","canberra","binary","minkowski") ) ) stop("metric has to be one of the following: spearman, pearson, kendall, euclidean, maximum, manhattan, canberra, binary, minkowski") | |
211 if ( ! ( SE.method %in% c( "firstSEmax","Tibs2001SEmax","globalSEmax","firstmax","globalmax") ) ) stop("SE.method has to be one of the following: firstSEmax, Tibs2001SEmax, globalSEmax, firstmax, globalmax") | |
212 if ( ! is.numeric(SE.factor) ) stop("SE.factor has to be a non-negative integer") else if ( SE.factor < 0 ) stop("SE.factor has to be a non-negative integer") | |
213 if ( ! ( is.numeric(do.gap) | is.logical(do.gap) ) ) stop( "do.gap has to be logical (TRUE/FALSE)" ) | |
214 if ( ! is.numeric(B.gap) ) stop("B.gap has to be a positive integer") else if ( round(B.gap) != B.gap | B.gap <= 0 ) stop("B.gap has to be a positive integer") | |
215 if ( ! is.numeric(cln) ) stop("cln has to be a non-negative integer") else if ( round(cln) != cln | cln < 0 ) stop("cln has to be a non-negative integer") | |
216 if ( ! is.numeric(rseed) ) stop("rseed has to be numeric") | |
217 if ( !do.gap & cln == 0 ) stop("cln has to be a positive integer or do.gap has to be TRUE") | |
218 object@clusterpar <- list(clustnr=clustnr,bootnr=bootnr,metric=metric,do.gap=do.gap,SE.method=SE.method,SE.factor=SE.factor,B.gap=B.gap,cln=cln,rseed=rseed) | |
219 y <- clustfun(object@fdata,clustnr,bootnr,metric,do.gap,SE.method,SE.factor,B.gap,cln,rseed) | |
220 object@kmeans <- list(kpart=y$clb$result$partition, jaccard=y$clb$bootmean, gap=y$gpr) | |
221 object@distances <- as.matrix( y$di ) | |
222 set.seed(111111) | |
223 object@fcol <- sample(rainbow(max(y$clb$result$partition))) | |
224 return(object) | |
225 } | |
226 ) | |
227 | |
228 setGeneric("findoutliers", function(object,outminc=5,outlg=2,probthr=1e-3,thr=2**-(1:40),outdistquant=.75) standardGeneric("findoutliers")) | |
229 | |
230 setMethod("findoutliers", | |
231 signature = "SCseq", | |
232 definition = function(object,outminc,outlg,probthr,thr,outdistquant) { | |
233 if ( length(object@kmeans$kpart) == 0 ) stop("run clustexp before findoutliers") | |
234 if ( ! is.numeric(outminc) ) stop("outminc has to be a non-negative integer") else if ( round(outminc) != outminc | outminc < 0 ) stop("outminc has to be a non-negative integer") | |
235 if ( ! is.numeric(outlg) ) stop("outlg has to be a non-negative integer") else if ( round(outlg) != outlg | outlg < 0 ) stop("outlg has to be a non-negative integer") | |
236 if ( ! is.numeric(probthr) ) stop("probthr has to be a number between 0 and 1") else if ( probthr < 0 | probthr > 1 ) stop("probthr has to be a number between 0 and 1") | |
237 if ( ! is.numeric(thr) ) stop("thr hast to be a vector of numbers between 0 and 1") else if ( min(thr) < 0 | max(thr) > 1 ) stop("thr hast to be a vector of numbers between 0 and 1") | |
238 if ( ! is.numeric(outdistquant) ) stop("outdistquant has to be a number between 0 and 1") else if ( outdistquant < 0 | outdistquant > 1 ) stop("outdistquant has to be a number between 0 and 1") | |
239 | |
240 object@outlierpar <- list( outminc=outminc,outlg=outlg,probthr=probthr,thr=thr,outdistquant=outdistquant ) | |
241 ### calibrate background model | |
242 m <- log2(apply(object@fdata,1,mean)) | |
243 v <- log2(apply(object@fdata,1,var)) | |
244 f <- m > -Inf & v > -Inf | |
245 m <- m[f] | |
246 v <- v[f] | |
247 mm <- -8 | |
248 repeat{ | |
249 fit <- lm(v ~ m + I(m^2)) | |
250 if( coef(fit)[3] >= 0 | mm >= 3){ | |
251 break | |
252 } | |
253 mm <- mm + .5 | |
254 f <- m > mm | |
255 m <- m[f] | |
256 v <- v[f] | |
257 } | |
258 object@background <- list() | |
259 object@background$vfit <- fit | |
260 object@background$lvar <- function(x,object) 2**(coef(object@background$vfit)[1] + log2(x)*coef(object@background$vfit)[2] + coef(object@background$vfit)[3] * log2(x)**2) | |
261 object@background$lsize <- function(x,object) x**2/(max(x + 1e-6,object@background$lvar(x,object)) - x) | |
262 | |
263 ### identify outliers | |
264 out <- c() | |
265 stest <- rep(0,length(thr)) | |
266 cprobs <- c() | |
267 for ( n in 1:max(object@kmeans$kpart) ){ | |
268 if ( sum(object@kmeans$kpart == n) == 1 ){ cprobs <- append(cprobs,.5); names(cprobs)[length(cprobs)] <- names(object@kmeans$kpart)[object@kmeans$kpart == n]; next } | |
269 x <- object@fdata[,object@kmeans$kpart == n] | |
270 x <- x[apply(x,1,max) > outminc,] | |
271 z <- t( apply(x,1,function(x){ apply( cbind( pnbinom(round(x,0),mu=mean(x),size=object@background$lsize(mean(x),object)) , 1 - pnbinom(round(x,0),mu=mean(x),size=object@background$lsize(mean(x),object)) ),1, min) } ) ) | |
272 cp <- apply(z,2,function(x){ y <- p.adjust(x,method="BH"); y <- y[order(y,decreasing=FALSE)]; return(y[outlg]);}) | |
273 f <- cp < probthr | |
274 cprobs <- append(cprobs,cp) | |
275 if ( sum(f) > 0 ) out <- append(out,names(x)[f]) | |
276 for ( j in 1:length(thr) ) stest[j] <- stest[j] + sum( cp < thr[j] ) | |
277 } | |
278 object@out <-list(out=out,stest=stest,thr=thr,cprobs=cprobs) | |
279 | |
280 ### cluster outliers | |
281 clp2p.cl <- c() | |
282 cols <- names(object@fdata) | |
283 di <- as.data.frame(object@distances) | |
284 for ( i in 1:max(object@kmeans$kpart) ) { | |
285 tcol <- cols[object@kmeans$kpart == i] | |
286 if ( sum(!(tcol %in% out)) > 1 ) clp2p.cl <- append(clp2p.cl,as.vector(t(di[tcol[!(tcol %in% out)],tcol[!(tcol %in% out)]]))) | |
287 } | |
288 clp2p.cl <- clp2p.cl[clp2p.cl>0] | |
289 | |
290 cpart <- object@kmeans$kpart | |
291 cadd <- list() | |
292 if ( length(out) > 0 ){ | |
293 if (length(out) == 1){ | |
294 cadd <- list(out) | |
295 }else{ | |
296 n <- out | |
297 m <- as.data.frame(di[out,out]) | |
298 | |
299 for ( i in 1:length(out) ){ | |
300 if ( length(n) > 1 ){ | |
301 o <- order(apply(cbind(m,1:dim(m)[1]),1,function(x) min(x[1:(length(x)-1)][-x[length(x)]])),decreasing=FALSE) | |
302 m <- m[o,o] | |
303 n <- n[o] | |
304 f <- m[,1] < quantile(clp2p.cl,outdistquant) | m[,1] == min(clp2p.cl) | |
305 ind <- 1 | |
306 if ( sum(f) > 1 ) for ( j in 2:sum(f) ) if ( apply(m[f,f][j,c(ind,j)] > quantile(clp2p.cl,outdistquant) ,1,sum) == 0 ) ind <- append(ind,j) | |
307 cadd[[i]] <- n[f][ind] | |
308 g <- ! n %in% n[f][ind] | |
309 n <- n[g] | |
310 m <- m[g,g] | |
311 if ( sum(g) == 0 ) break | |
312 | |
313 }else if (length(n) == 1){ | |
314 cadd[[i]] <- n | |
315 break | |
316 } | |
317 } | |
318 } | |
319 | |
320 for ( i in 1:length(cadd) ){ | |
321 cpart[cols %in% cadd[[i]]] <- max(cpart) + 1 | |
322 } | |
323 } | |
324 | |
325 ### determine final clusters | |
326 for ( i in 1:max(cpart) ){ | |
327 d <- object@fdata[,cols[cpart == i]] | |
328 if ( sum(cpart == i) == 1 ) cent <- d else cent <- apply(d,1,mean) | |
329 if ( i == 1 ) dcent <- data.frame(cent) else dcent <- cbind(dcent,cent) | |
330 if ( i == 1 ) tmp <- data.frame(apply(object@fdata[,cols],2,dist.gen.pairs,y=cent,method=object@clusterpar$metric)) else tmp <- cbind(tmp,apply(object@fdata[,cols],2,dist.gen.pairs,y=cent,method=object@clusterpar$metric)) | |
331 } | |
332 cpart <- apply(tmp,1,function(x) order(x,decreasing=FALSE)[1]) | |
333 | |
334 for ( i in max(cpart):1){if (sum(cpart==i)==0) cpart[cpart>i] <- cpart[cpart>i] - 1 } | |
335 | |
336 object@cpart <- cpart | |
337 | |
338 set.seed(111111) | |
339 object@fcol <- sample(rainbow(max(cpart))) | |
340 return(object) | |
341 } | |
342 ) | |
343 | |
344 setGeneric("plotgap", function(object) standardGeneric("plotgap")) | |
345 | |
346 setMethod("plotgap", | |
347 signature = "SCseq", | |
348 definition = function(object){ | |
349 if ( length(object@kmeans$kpart) == 0 ) stop("run clustexp before plotgap") | |
350 plot(object@kmeans$gap) | |
351 } | |
352 ) | |
353 | |
354 setGeneric("plotsilhouette", function(object) standardGeneric("plotsilhouette")) | |
355 | |
356 setMethod("plotsilhouette", | |
357 signature = "SCseq", | |
358 definition = function(object){ | |
359 if ( length(object@kmeans$kpart) == 0 ) stop("run clustexp before plotsilhouette") | |
360 if ( length(unique(object@kmeans$kpart)) < 2 ) stop("only a single cluster: no silhouette plot") | |
361 kpart <- object@kmeans$kpart | |
362 distances <- dist.gen(object@distances) | |
363 si <- silhouette(kpart,distances) | |
364 plot(si) | |
365 } | |
366 ) | |
367 | |
368 setGeneric("plotjaccard", function(object) standardGeneric("plotjaccard")) | |
369 | |
370 setMethod("plotjaccard", | |
371 signature = "SCseq", | |
372 definition = function(object){ | |
373 if ( length(object@kmeans$kpart) == 0 ) stop("run clustexp before plotjaccard") | |
374 if ( length(unique(object@kmeans$kpart)) < 2 ) stop("only a single cluster: no Jaccard's similarity plot") | |
375 barplot(object@kmeans$jaccard,names.arg=1:length(object@kmeans$jaccard),ylab="Jaccard's similarity") | |
376 } | |
377 ) | |
378 | |
379 setGeneric("plotoutlierprobs", function(object) standardGeneric("plotoutlierprobs")) | |
380 | |
381 setMethod("plotoutlierprobs", | |
382 signature = "SCseq", | |
383 definition = function(object){ | |
384 if ( length(object@cpart) == 0 ) stop("run findoutliers before plotoutlierprobs") | |
385 p <- object@kmeans$kpart[ order(object@kmeans$kpart,decreasing=FALSE)] | |
386 x <- object@out$cprobs[names(p)] | |
387 fcol <- object@fcol | |
388 for ( i in 1:max(p) ){ | |
389 y <- -log10(x + 2.2e-16) | |
390 y[p != i] <- 0 | |
391 if ( i == 1 ) b <- barplot(y,ylim=c(0,max(-log10(x + 2.2e-16))*1.1),col=fcol[i],border=fcol[i],names.arg=FALSE,ylab="-log10prob") else barplot(y,add=TRUE,col=fcol[i],border=fcol[i],names.arg=FALSE,axes=FALSE) | |
392 } | |
393 abline(-log10(object@outlierpar$probthr),0,col="black",lty=2) | |
394 d <- b[2,1] - b[1,1] | |
395 y <- 0 | |
396 for ( i in 1:max(p) ) y <- append(y,b[sum(p <=i),1] + d/2) | |
397 axis(1,at=(y[1:(length(y)-1)] + y[-1])/2,lab=1:max(p)) | |
398 box() | |
399 } | |
400 ) | |
401 | |
402 setGeneric("plotbackground", function(object) standardGeneric("plotbackground")) | |
403 | |
404 setMethod("plotbackground", | |
405 signature = "SCseq", | |
406 definition = function(object){ | |
407 if ( length(object@cpart) == 0 ) stop("run findoutliers before plotbackground") | |
408 m <- apply(object@fdata,1,mean) | |
409 v <- apply(object@fdata,1,var) | |
410 fit <- locfit(v~lp(m,nn=.7),family="gamma",maxk=500) | |
411 plot(log2(m),log2(v),pch=20,xlab="log2mean",ylab="log2var",col="grey") | |
412 lines(log2(m[order(m)]),log2(object@background$lvar(m[order(m)],object)),col="red",lwd=2) | |
413 lines(log2(m[order(m)]),log2(fitted(fit)[order(m)]),col="orange",lwd=2,lty=2) | |
414 legend("topleft",legend=substitute(paste("y = ",a,"*x^2 + ",b,"*x + ",d,sep=""),list(a=round(coef(object@background$vfit)[3],2),b=round(coef(object@background$vfit)[2],2),d=round(coef(object@background$vfit)[1],2))),bty="n") | |
415 } | |
416 ) | |
417 | |
418 setGeneric("plotsensitivity", function(object) standardGeneric("plotsensitivity")) | |
419 | |
420 setMethod("plotsensitivity", | |
421 signature = "SCseq", | |
422 definition = function(object){ | |
423 if ( length(object@cpart) == 0 ) stop("run findoutliers before plotsensitivity") | |
424 plot(log10(object@out$thr), object@out$stest, type="l",xlab="log10 Probability cutoff", ylab="Number of outliers") | |
425 abline(v=log10(object@outlierpar$probthr),col="red",lty=2) | |
426 } | |
427 ) | |
428 | |
429 setGeneric("clustdiffgenes", function(object,pvalue=.01) standardGeneric("clustdiffgenes")) | |
430 | |
431 setMethod("clustdiffgenes", | |
432 signature = "SCseq", | |
433 definition = function(object,pvalue){ | |
434 if ( length(object@cpart) == 0 ) stop("run findoutliers before clustdiffgenes") | |
435 if ( ! is.numeric(pvalue) ) stop("pvalue has to be a number between 0 and 1") else if ( pvalue < 0 | pvalue > 1 ) stop("pvalue has to be a number between 0 and 1") | |
436 cdiff <- list() | |
437 x <- object@ndata | |
438 y <- object@expdata[,names(object@ndata)] | |
439 part <- object@cpart | |
440 for ( i in 1:max(part) ){ | |
441 if ( sum(part == i) == 0 ) next | |
442 m <- apply(x,1,mean) | |
443 n <- if ( sum(part == i) > 1 ) apply(x[,part == i],1,mean) else x[,part == i] | |
444 no <- if ( sum(part == i) > 1 ) median(apply(y[,part == i],2,sum))/median(apply(x[,part == i],2,sum)) else sum(y[,part == i])/sum(x[,part == i]) | |
445 m <- m*no | |
446 n <- n*no | |
447 pv <- binompval(m/sum(m),sum(n),n) | |
448 d <- data.frame(mean.all=m,mean.cl=n,fc=n/m,pv=pv)[order(pv,decreasing=FALSE),] | |
449 cdiff[[paste("cl",i,sep=".")]] <- d[d$pv < pvalue,] | |
450 } | |
451 return(cdiff) | |
452 } | |
453 ) | |
454 | |
455 setGeneric("diffgenes", function(object,cl1,cl2,mincount=5) standardGeneric("diffgenes")) | |
456 | |
457 setMethod("diffgenes", | |
458 signature = "SCseq", | |
459 definition = function(object,cl1,cl2,mincount){ | |
460 part <- object@cpart | |
461 cl1 <- c(cl1) | |
462 cl2 <- c(cl2) | |
463 if ( length(part) == 0 ) stop("run findoutliers before diffgenes") | |
464 if ( ! is.numeric(mincount) ) stop("mincount has to be a non-negative number") else if ( mincount < 0 ) stop("mincount has to be a non-negative number") | |
465 if ( length(intersect(cl1, part)) < length(unique(cl1)) ) stop( paste("cl1 has to be a subset of ",paste(sort(unique(part)),collapse=","),"\n",sep="") ) | |
466 if ( length(intersect(cl2, part)) < length(unique(cl2)) ) stop( paste("cl2 has to be a subset of ",paste(sort(unique(part)),collapse=","),"\n",sep="") ) | |
467 f <- apply(object@ndata[,part %in% c(cl1,cl2)],1,max) > mincount | |
468 x <- object@ndata[f,part %in% cl1] | |
469 y <- object@ndata[f,part %in% cl2] | |
470 if ( sum(part %in% cl1) == 1 ) m1 <- x else m1 <- apply(x,1,mean) | |
471 if ( sum(part %in% cl2) == 1 ) m2 <- y else m2 <- apply(y,1,mean) | |
472 if ( sum(part %in% cl1) == 1 ) s1 <- sqrt(x) else s1 <- sqrt(apply(x,1,var)) | |
473 if ( sum(part %in% cl2) == 1 ) s2 <- sqrt(y) else s2 <- sqrt(apply(y,1,var)) | |
474 | |
475 d <- ( m1 - m2 )/ apply( cbind( s1, s2 ),1,mean ) | |
476 names(d) <- rownames(object@ndata)[f] | |
477 if ( sum(part %in% cl1) == 1 ){ | |
478 names(x) <- names(d) | |
479 x <- x[order(d,decreasing=TRUE)] | |
480 }else{ | |
481 x <- x[order(d,decreasing=TRUE),] | |
482 } | |
483 if ( sum(part %in% cl2) == 1 ){ | |
484 names(y) <- names(d) | |
485 y <- y[order(d,decreasing=TRUE)] | |
486 }else{ | |
487 y <- y[order(d,decreasing=TRUE),] | |
488 } | |
489 return(list(z=d[order(d,decreasing=TRUE)],cl1=x,cl2=y,cl1n=cl1,cl2n=cl2)) | |
490 } | |
491 ) | |
492 | |
493 plotdiffgenes <- function(z,gene=g){ | |
494 if ( ! is.list(z) ) stop("first arguments needs to be output of function diffgenes") | |
495 if ( length(z) < 5 ) stop("first arguments needs to be output of function diffgenes") | |
496 if ( sum(names(z) == c("z","cl1","cl2","cl1n","cl2n")) < 5 ) stop("first arguments needs to be output of function diffgenes") | |
497 if ( length(gene) > 1 ) stop("only single value allowed for argument gene") | |
498 if ( !is.numeric(gene) & !(gene %in% names(z$z)) ) stop("argument gene needs to be within rownames of first argument or a positive integer number") | |
499 if ( is.numeric(gene) ){ if ( gene < 0 | round(gene) != gene ) stop("argument gene needs to be within rownames of first argument or a positive integer number") } | |
500 x <- if ( is.null(dim(z$cl1)) ) z$cl1[gene] else t(z$cl1[gene,]) | |
501 y <- if ( is.null(dim(z$cl2)) ) z$cl2[gene] else t(z$cl2[gene,]) | |
502 plot(1:length(c(x,y)),c(x,y),ylim=c(0,max(c(x,y))),xlab="",ylab="Expression",main=gene,cex=0,axes=FALSE) | |
503 axis(2) | |
504 box() | |
505 u <- 1:length(x) | |
506 rect(u - .5,0,u + .5,x,col="red") | |
507 v <- c(min(u) - .5,max(u) + .5) | |
508 axis(1,at=mean(v),lab=paste(z$cl1n,collapse=",")) | |
509 lines(v,rep(mean(x),length(v))) | |
510 lines(v,rep(mean(x)-sqrt(var(x)),length(v)),lty=2) | |
511 lines(v,rep(mean(x)+sqrt(var(x)),length(v)),lty=2) | |
512 | |
513 u <- ( length(x) + 1 ):length(c(x,y)) | |
514 v <- c(min(u) - .5,max(u) + .5) | |
515 rect(u - .5,0,u + .5,y,col="blue") | |
516 axis(1,at=mean(v),lab=paste(z$cl2n,collapse=",")) | |
517 lines(v,rep(mean(y),length(v))) | |
518 lines(v,rep(mean(y)-sqrt(var(y)),length(v)),lty=2) | |
519 lines(v,rep(mean(y)+sqrt(var(y)),length(v)),lty=2) | |
520 abline(v=length(x) + .5) | |
521 } | |
522 | |
523 setGeneric("clustheatmap", function(object,final=FALSE,hmethod="single") standardGeneric("clustheatmap")) | |
524 | |
525 setMethod("clustheatmap", | |
526 signature = "SCseq", | |
527 definition = function(object,final,hmethod){ | |
528 if ( final & length(object@cpart) == 0 ) stop("run findoutliers before clustheatmap") | |
529 if ( !final & length(object@kmeans$kpart) == 0 ) stop("run clustexp before clustheatmap") | |
530 x <- object@fdata | |
531 part <- if ( final ) object@cpart else object@kmeans$kpart | |
532 na <- c() | |
533 j <- 0 | |
534 for ( i in 1:max(part) ){ | |
535 if ( sum(part == i) == 0 ) next | |
536 j <- j + 1 | |
537 na <- append(na,i) | |
538 d <- x[,part == i] | |
539 if ( sum(part == i) == 1 ) cent <- d else cent <- apply(d,1,mean) | |
540 if ( j == 1 ) tmp <- data.frame(cent) else tmp <- cbind(tmp,cent) | |
541 } | |
542 names(tmp) <- paste("cl",na,sep=".") | |
543 if ( max(part) > 1 ) cclmo <- hclust(dist.gen(as.matrix(dist.gen(t(tmp),method=object@clusterpar$metric))),method=hmethod)$order else cclmo <- 1 | |
544 q <- part | |
545 for ( i in 1:max(part) ){ | |
546 q[part == na[cclmo[i]]] <- i | |
547 } | |
548 part <- q | |
549 di <- as.data.frame( as.matrix( dist.gen(t(object@distances)) ) ) | |
550 pto <- part[order(part,decreasing=FALSE)] | |
551 ptn <- c() | |
552 for ( i in 1:max(pto) ){ pt <- names(pto)[pto == i]; z <- if ( length(pt) == 1 ) pt else pt[hclust(as.dist(t(di[pt,pt])),method=hmethod)$order]; ptn <- append(ptn,z) } | |
553 col <- object@fcol | |
554 mi <- min(di,na.rm=TRUE) | |
555 ma <- max(di,na.rm=TRUE) | |
556 layout(matrix(data=c(1,3,2,4), nrow=2, ncol=2), widths=c(5,1,5,1), heights=c(5,1,1,1)) | |
557 ColorRamp <- colorRampPalette(brewer.pal(n = 7,name = "RdYlBu"))(100) | |
558 ColorLevels <- seq(mi, ma, length=length(ColorRamp)) | |
559 if ( mi == ma ){ | |
560 ColorLevels <- seq(0.99*mi, 1.01*ma, length=length(ColorRamp)) | |
561 } | |
562 par(mar = c(3,5,2.5,2)) | |
563 image(as.matrix(di[ptn,ptn]),col=ColorRamp,axes=FALSE) | |
564 abline(0,1) | |
565 box() | |
566 | |
567 tmp <- c() | |
568 for ( u in 1:max(part) ){ | |
569 ol <- (0:(length(part) - 1)/(length(part) - 1))[ptn %in% names(x)[part == u]] | |
570 points(rep(0,length(ol)),ol,col=col[cclmo[u]],pch=15,cex=.75) | |
571 points(ol,rep(0,length(ol)),col=col[cclmo[u]],pch=15,cex=.75) | |
572 tmp <- append(tmp,mean(ol)) | |
573 } | |
574 axis(1,at=tmp,lab=cclmo) | |
575 axis(2,at=tmp,lab=cclmo) | |
576 par(mar = c(3,2.5,2.5,2)) | |
577 image(1, ColorLevels, | |
578 matrix(data=ColorLevels, ncol=length(ColorLevels),nrow=1), | |
579 col=ColorRamp, | |
580 xlab="",ylab="", | |
581 xaxt="n") | |
582 layout(1) | |
583 return(cclmo) | |
584 } | |
585 ) | |
586 | |
587 setGeneric("comptsne", function(object,rseed=15555) standardGeneric("comptsne")) | |
588 | |
589 setMethod("comptsne", | |
590 signature = "SCseq", | |
591 definition = function(object,rseed){ | |
592 if ( length(object@kmeans$kpart) == 0 ) stop("run clustexp before comptsne") | |
593 set.seed(rseed) | |
594 di <- dist.gen(as.matrix(object@distances)) | |
595 ts <- tsne(di,k=2) | |
596 object@tsne <- as.data.frame(ts) | |
597 return(object) | |
598 } | |
599 ) | |
600 | |
601 setGeneric("plottsne", function(object,final=TRUE) standardGeneric("plottsne")) | |
602 | |
603 setMethod("plottsne", | |
604 signature = "SCseq", | |
605 definition = function(object,final){ | |
606 if ( length(object@tsne) == 0 ) stop("run comptsne before plottsne") | |
607 if ( final & length(object@cpart) == 0 ) stop("run findoutliers before plottsne") | |
608 if ( !final & length(object@kmeans$kpart) == 0 ) stop("run clustexp before plottsne") | |
609 part <- if ( final ) object@cpart else object@kmeans$kpart | |
610 plot(object@tsne,xlab="Dim 1",ylab="Dim 2",pch=20,cex=1.5,col="lightgrey") | |
611 for ( i in 1:max(part) ){ | |
612 if ( sum(part == i) > 0 ) text(object@tsne[part == i,1],object@tsne[part == i,2],i,col=object@fcol[i],cex=.75,font=4) | |
613 } | |
614 } | |
615 ) | |
616 | |
617 setGeneric("plotlabelstsne", function(object,labels=NULL) standardGeneric("plotlabelstsne")) | |
618 | |
619 setMethod("plotlabelstsne", | |
620 signature = "SCseq", | |
621 definition = function(object,labels){ | |
622 if ( is.null(labels ) ) labels <- names(object@ndata) | |
623 if ( length(object@tsne) == 0 ) stop("run comptsne before plotlabelstsne") | |
624 plot(object@tsne,xlab="Dim 1",ylab="Dim 2",pch=20,cex=1.5,col="lightgrey") | |
625 text(object@tsne[,1],object@tsne[,2],labels,cex=.5) | |
626 } | |
627 ) | |
628 | |
629 setGeneric("plotsymbolstsne", function(object,types=NULL) standardGeneric("plotsymbolstsne")) | |
630 | |
631 setMethod("plotsymbolstsne", | |
632 signature = "SCseq", | |
633 definition = function(object,types){ | |
634 if ( is.null(types) ) types <- names(object@fdata) | |
635 if ( length(object@tsne) == 0 ) stop("run comptsne before plotsymbolstsne") | |
636 if ( length(types) != ncol(object@fdata) ) stop("types argument has wrong length. Length has to equal to the column number of object@ndata") | |
637 coloc <- rainbow(length(unique(types))) | |
638 syms <- c() | |
639 plot(object@tsne,xlab="Dim 1",ylab="Dim 2",pch=20,col="grey") | |
640 for ( i in 1:length(unique(types)) ){ | |
641 f <- types == sort(unique(types))[i] | |
642 syms <- append( syms, ( (i-1) %% 25 ) + 1 ) | |
643 points(object@tsne[f,1],object@tsne[f,2],col=coloc[i],pch=( (i-1) %% 25 ) + 1,cex=1) | |
644 } | |
645 legend("topleft", legend=sort(unique(types)), col=coloc, pch=syms) | |
646 } | |
647 ) | |
648 | |
649 setGeneric("plotexptsne", function(object,g,n="",logsc=FALSE) standardGeneric("plotexptsne")) | |
650 | |
651 setMethod("plotexptsne", | |
652 signature = "SCseq", | |
653 definition = function(object,g,n="",logsc=FALSE){ | |
654 if ( length(object@tsne) == 0 ) stop("run comptsne before plottsne") | |
655 if ( length(intersect(g,rownames(object@ndata))) < length(unique(g)) ) stop("second argument does not correspond to set of rownames slot ndata of SCseq object") | |
656 if ( !is.numeric(logsc) & !is.logical(logsc) ) stop("argument logsc has to be logical (TRUE/FALSE)") | |
657 if ( n == "" ) n <- g[1] | |
658 l <- apply(object@ndata[g,] - .1,2,sum) + .1 | |
659 if (logsc) { | |
660 f <- l == 0 | |
661 l <- log(l) | |
662 l[f] <- NA | |
663 } | |
664 mi <- min(l,na.rm=TRUE) | |
665 ma <- max(l,na.rm=TRUE) | |
666 ColorRamp <- colorRampPalette(rev(brewer.pal(n = 7,name = "RdYlBu")))(100) | |
667 ColorLevels <- seq(mi, ma, length=length(ColorRamp)) | |
668 v <- round((l - mi)/(ma - mi)*99 + 1,0) | |
669 layout(matrix(data=c(1,3,2,4), nrow=2, ncol=2), widths=c(5,1,5,1), heights=c(5,1,1,1)) | |
670 par(mar = c(3,5,2.5,2)) | |
671 plot(object@tsne,xlab="Dim 1",ylab="Dim 2",main=n,pch=20,cex=0,col="grey") | |
672 for ( k in 1:length(v) ){ | |
673 points(object@tsne[k,1],object@tsne[k,2],col=ColorRamp[v[k]],pch=20,cex=1.5) | |
674 } | |
675 par(mar = c(3,2.5,2.5,2)) | |
676 image(1, ColorLevels, | |
677 matrix(data=ColorLevels, ncol=length(ColorLevels),nrow=1), | |
678 col=ColorRamp, | |
679 xlab="",ylab="", | |
680 xaxt="n") | |
681 layout(1) | |
682 } | |
683 ) |