Mercurial > repos > vmarcon > plot_pca
comparison plot_pca.R @ 0:610e86c430a9 draft default tip
planemo upload commit 0b661bcf940c03e11becd42b3321df9573b591b2
| author | vmarcon |
|---|---|
| date | Mon, 23 Oct 2017 09:34:56 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:610e86c430a9 |
|---|---|
| 1 #!/usr/bin/env Rscript | |
| 2 | |
| 3 # DEFAULT OPTIONS | |
| 4 | |
| 5 opt = list() | |
| 6 opt$log10 = FALSE | |
| 7 opt$pseudocount = 1e-04 | |
| 8 opt$row_as_variables = FALSE | |
| 9 | |
| 10 suppressPackageStartupMessages(library("optparse")) | |
| 11 | |
| 12 options(stringsAsFactors=F) | |
| 13 | |
| 14 ################## | |
| 15 # OPTION PARSING | |
| 16 ################## | |
| 17 | |
| 18 option_list <- list( | |
| 19 make_option(c("-i", "--input_matrix"), help="the matrix you want to analyze. Can be stdin"), | |
| 20 make_option(c("-l", "--log10"), action="store_true", default=FALSE, help="apply the log [default=FALSE]"), | |
| 21 make_option(c("-p", "--pseudocount"), type="double", help="specify a pseudocount for the log [default=%default]", default=1e-04), | |
| 22 make_option(c("-m", "--metadata"), help="A list of tsv files with metadata on matrix experiment.\n\t\tThey must be in the format 'file1.tsv,file2.tsv' and contain a key column named 'labExpId'. Can be omitted"), | |
| 23 | |
| 24 make_option(c("--merge_mdata_on"), default="labExpId", | |
| 25 help="[default=%default]"), | |
| 26 | |
| 27 #make_option(c("-o", "--output"), help="additional info you want to put in the output file name", default="out"), | |
| 28 make_option(c("-c", "--color_by"), help="choose the fields in the metadata you want to color by", type='character'), | |
| 29 | |
| 30 make_option(c("--sort_color"), type='character', | |
| 31 help="A field for sorting colors. Can be omitted [default=%default]"), | |
| 32 | |
| 33 make_option(c("-s", "--shape_by"), default=NULL, type="character", help="choose the fields in the metadata you want to shape by"), | |
| 34 | |
| 35 make_option(c("--no_legend"), action="store_true", default=FALSE, | |
| 36 help="Do not show the legend [default=%default]"), | |
| 37 | |
| 38 make_option(c("-r", "--row_as_variables"), action="store_true", help="select this if you want rows as variables [default=%default]", default=FALSE), | |
| 39 make_option(c("-C", "--princomp"), help="choose the principal components you want to plot. With 3 PC it gives a 3d plot [default='PC1,PC2']", default="PC1,PC2"), | |
| 40 | |
| 41 make_option(c("--print_scores"), action="store_true", default=FALSE, | |
| 42 help="Output the resuling PCs as a separate file with the extension PCs.tsv [default=%default]"), | |
| 43 | |
| 44 make_option(c("--print_loadings"), action="store_true", default=FALSE, | |
| 45 help="Output the resulting loadings as a separate file with the extension loadings.tsv [default=%default]"), | |
| 46 | |
| 47 make_option(c("--print_lambdas"), action="store_true", default=FALSE, | |
| 48 help="Output the resulting lambdas (stdev) as a separate file with the extension lambdas.tsv [default=%default]"), | |
| 49 | |
| 50 make_option(c("--biplot"), default=FALSE, action="store_true", | |
| 51 help="If active, the factor of the color is used as grouping factor. | |
| 52 Centroids are computed and the first <top> loadings are plotted wrt to the two specified components [default=%default]"), | |
| 53 | |
| 54 make_option(c("--palette"), default="/users/rg/abreschi/R/palettes/cbbPalette1.15.txt", | |
| 55 help="File with the color palette [default=%default]"), | |
| 56 | |
| 57 make_option(c("--border"), default=FALSE, action="store_true", | |
| 58 help="Black border to dots [default=%default]"), | |
| 59 | |
| 60 make_option(c("--shapes"), | |
| 61 help="File with the shapes [default=%default]"), | |
| 62 | |
| 63 make_option(c("-L", "--labels"), default=NULL, type="character", | |
| 64 help="The metadata field with the labels [default=%default]"), | |
| 65 | |
| 66 make_option(c("-B", "--base_size"), default=16, type='numeric', | |
| 67 help="Base font size [default=%default]"), | |
| 68 | |
| 69 make_option(c("-H", "--height"), default=7, | |
| 70 help="Height of the plot in inches [default=%default]"), | |
| 71 | |
| 72 make_option(c("-W", "--width"), default=7, | |
| 73 help="Width of the plot in inches [default=%default]"), | |
| 74 | |
| 75 make_option(c("-o", "--output"), default="pca.out", | |
| 76 help="output file name [default=%default]"), | |
| 77 | |
| 78 make_option(c("-v", "--verbose"), action='store_true', default=FALSE, | |
| 79 help="verbose output [default=%default]") | |
| 80 ) | |
| 81 | |
| 82 parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) | |
| 83 arguments <- parse_args(parser, positional_arguments = TRUE) | |
| 84 opt <- arguments$options | |
| 85 | |
| 86 if (opt$verbose) {print(opt)} | |
| 87 ##------------ | |
| 88 ## LIBRARIES | |
| 89 ##------------ | |
| 90 suppressPackageStartupMessages(library("reshape")) | |
| 91 suppressPackageStartupMessages(library("ggplot2")) | |
| 92 suppressPackageStartupMessages(library("grid")) | |
| 93 | |
| 94 | |
| 95 ############### | |
| 96 # BEGIN | |
| 97 ############## | |
| 98 | |
| 99 # read input tables | |
| 100 inF = opt$input_matrix; if (opt$input_matrix == "stdin") {inF = file("stdin")} | |
| 101 m = read.table(inF, h=T, sep="\t") | |
| 102 if (opt$verbose) { | |
| 103 cat("Sample of input matrix:\n") | |
| 104 print(head(m[,1:10])) | |
| 105 } | |
| 106 | |
| 107 | |
| 108 # Read the color palette | |
| 109 my_palette = read.table(opt$palette, h=F, comment.char="%", sep="\t")$V1 | |
| 110 | |
| 111 # Read the color ordering | |
| 112 if (is.null(opt$sort_color)) { | |
| 113 sort_color=NULL | |
| 114 }else{ | |
| 115 sort_color = strsplit(opt$sort_color, ",")[[1]] | |
| 116 } | |
| 117 | |
| 118 # Read the shapes | |
| 119 if (!is.null(opt$shapes)) { | |
| 120 my_shapes = read.table(opt$shapes, h=F, comment.char="%")$V1 | |
| 121 } | |
| 122 | |
| 123 # remove potential gene id columns | |
| 124 char_cols <- which(sapply(m, class) == 'character') | |
| 125 if (length(char_cols) == 0) {genes = rownames(m)} | |
| 126 if (length(char_cols) != 0) {genes = m[,char_cols]; m = m[,-(char_cols)]} | |
| 127 | |
| 128 if (opt$verbose) {sprintf("WARNING: column %s is character, so it is removed from the analysis", char_cols)} | |
| 129 | |
| 130 # apply the log if required | |
| 131 if (opt$log10) {m = log10(replace(m, is.na(m), 0) + opt$pseudocount)} | |
| 132 | |
| 133 # apply pca | |
| 134 #if (opt$row_as_variable) { | |
| 135 #m_pca = prcomp(na.omit(m), center=FALSE, scale.=FALSE)} else{ | |
| 136 #m_pca = prcomp(t(na.omit(m)), center=FALSE, scale.=FALSE)} | |
| 137 #Modified by Gaelle Lefort | |
| 138 # apply pca | |
| 139 if (opt$row_as_variable) { | |
| 140 m_pca = prcomp(t(na.omit(m)), center=FALSE, scale.=FALSE) | |
| 141 } else { | |
| 142 m_pca = prcomp(na.omit(m), center=FALSE, scale.=FALSE) | |
| 143 } | |
| 144 | |
| 145 | |
| 146 # Scale the scores for biplot | |
| 147 #scaledScores = sweep(m_pca$x, 2, m_pca$sdev / sqrt(nrow(m_pca$x)), "/") | |
| 148 scaledScores = m_pca$x | |
| 149 | |
| 150 if (opt$verbose) {print(dim(na.omit(m)))} | |
| 151 | |
| 152 # HANDLING METADATA | |
| 153 | |
| 154 # add metadata to pca results, they should be input in the command line in the future | |
| 155 if (is.null(opt$color_by)) {color_by=NULL | |
| 156 }else{color_by = color_by = strsplit(opt$color_by, ",")[[1]]} | |
| 157 if (is.null(opt$shape_by)) {shape_by=NULL | |
| 158 }else{shape_by = strsplit(opt$shape_by, ",")[[1]]} | |
| 159 | |
| 160 # read metadata, one or more table to be merged on labExpId | |
| 161 if (!is.null(opt$metadata)){ | |
| 162 mdata = read.table(opt$metadata, h=T, sep="\t", row.names=NULL, comment.char="", quote=""); | |
| 163 if (opt$merge_mdata_on %in% colnames(mdata)) { | |
| 164 mdata[,opt$merge_mdata_on] <- gsub("[,-]", ".", mdata[,opt$merge_mdata_on]) | |
| 165 } | |
| 166 | |
| 167 if (opt$verbose) {cat('append metadata...')} | |
| 168 | |
| 169 df = merge(as.data.frame(scaledScores), | |
| 170 unique(mdata[c(color_by, shape_by, opt$merge_mdata_on, opt$labels)]), by.x='row.names', by.y=opt$merge_mdata_on, all.x=T) | |
| 171 if (opt$verbose) {cat("DONE\n")} | |
| 172 }else{ | |
| 173 df = as.data.frame(scaledScores) | |
| 174 } | |
| 175 | |
| 176 | |
| 177 if (opt$verbose) {print(head(df))} | |
| 178 | |
| 179 ######### | |
| 180 # OUTPUT | |
| 181 ######### | |
| 182 | |
| 183 output_name = opt$output | |
| 184 | |
| 185 # Print text outputs if required | |
| 186 | |
| 187 # -- principal components -- | |
| 188 if (opt$print_scores) { | |
| 189 write.table(m_pca$x, sprintf("%s.PCs.tsv", output_name), quote=F, sep="\t") | |
| 190 } | |
| 191 | |
| 192 # -- loadings -- | |
| 193 if (opt$print_loadings) { | |
| 194 write.table(m_pca$rotation, sprintf("%s.loadings.tsv", output_name), quote=F, sep="\t") | |
| 195 } | |
| 196 | |
| 197 # -- lambdas -- | |
| 198 if (opt$print_lambdas) { | |
| 199 perc = round(100*m_pca$sdev/sum(m_pca$sdev), 2) | |
| 200 variances = round(m_pca$sdev^2/sum(m_pca$sdev^2)*100, 2) | |
| 201 out_df = data.frame(lambda=m_pca$sdev, perc=perc, var_perc=variances) | |
| 202 write.table(out_df, sprintf("%s.lambdas.tsv", output_name), quote=F, sep="\t") | |
| 203 } | |
| 204 | |
| 205 # Read the required components | |
| 206 prinComp = strsplit(opt$princomp, ",")[[1]] | |
| 207 prinComp_i = as.numeric(gsub("PC", "", prinComp)) | |
| 208 | |
| 209 # Get a vector with all the variance percentages | |
| 210 variances = round(m_pca$sdev^2/sum(m_pca$sdev^2)*100, 2) | |
| 211 | |
| 212 if (opt$biplot) { | |
| 213 | |
| 214 aggrVar = opt$color_by | |
| 215 | |
| 216 # === Centroids === | |
| 217 | |
| 218 centroids = aggregate ( | |
| 219 formula(sprintf(".~%s", aggrVar)), | |
| 220 df[,c(colnames(df)[grep("^PC", colnames(df))], aggrVar)], | |
| 221 mean | |
| 222 ) | |
| 223 centroidsM = centroids[,-1] | |
| 224 | |
| 225 # === Loadings === | |
| 226 | |
| 227 vecNorm = function(x) {sqrt(sum(x**2))} | |
| 228 | |
| 229 scaledLoadings = sweep(m_pca$rotation, 2, m_pca$sdev * nrow(m_pca$x), "*") | |
| 230 | |
| 231 centroidsNorm = apply(centroidsM[,prinComp], 1, vecNorm) # DIM: number of levels x 1 | |
| 232 loadingsNorm = apply(scaledLoadings[,prinComp], 1, vecNorm) # DIM: number of variables x 1 | |
| 233 | |
| 234 cosine = ( scaledLoadings[,prinComp] %*% t(centroidsM[,prinComp]) ) / (loadingsNorm %*% t(centroidsNorm)) | |
| 235 cosine = setNames(data.frame(cosine), centroids[,1]) | |
| 236 | |
| 237 closest = setNames(melt(apply(1-cosine, 2, rank)), c("variable", aggrVar, "rank")) | |
| 238 write.table( closest, file=sprintf("%s.cosine.tsv", opt$output), quote=F, row.names=F, sep="\t"); | |
| 239 | |
| 240 closest_df = data.frame(merge(closest, scaledLoadings, by.x="variable", by.y="row.names")) | |
| 241 | |
| 242 | |
| 243 } | |
| 244 | |
| 245 | |
| 246 | |
| 247 ############# | |
| 248 # PLOT | |
| 249 ############# | |
| 250 | |
| 251 # plot parameters | |
| 252 pts = 5 | |
| 253 | |
| 254 l_col = opt$labels | |
| 255 base_size = opt$base_size | |
| 256 | |
| 257 theme_set(theme_bw(base_size = base_size)) | |
| 258 theme_update(legend.text=element_text(size=0.9*base_size), | |
| 259 legend.key.size=unit(0.9*base_size, "points"), | |
| 260 legend.key = element_blank() | |
| 261 ) | |
| 262 | |
| 263 top = 30 | |
| 264 | |
| 265 | |
| 266 # Open device for plotting | |
| 267 pdf(sprintf("%s.pdf", output_name), w=opt$width, h=opt$height) | |
| 268 | |
| 269 if (length(prinComp) == 2){ | |
| 270 | |
| 271 geom_params = list() | |
| 272 geom_params$size = pts | |
| 273 # geom_params$alpha = opt$alpha | |
| 274 | |
| 275 mapping = list() | |
| 276 mapping <- modifyList(mapping, aes_string(x=prinComp[1], y=prinComp[2])) | |
| 277 | |
| 278 if (!is.null(opt$color_by)) { | |
| 279 gp_color_by=interaction(df[color_by]) | |
| 280 if (!is.null(opt$sort_color)) { | |
| 281 gp_color_by = factor(gp_color_by, levels=sort_color) | |
| 282 } | |
| 283 mapping = modifyList(mapping, aes_string(color=gp_color_by, order=gp_color_by)) | |
| 284 } else { | |
| 285 gp_color_by=NULL | |
| 286 } | |
| 287 | |
| 288 if (!is.null(opt$shape_by)) { | |
| 289 gp_shape_by=interaction(df[shape_by]) | |
| 290 if (!is.null(opt$sort_shape)) { | |
| 291 gp_shape_by = factor(gp_shape_by, levels=sort_shape) | |
| 292 } | |
| 293 mapping = modifyList(mapping, aes_string(shape=gp_shape_by, order=gp_shape_by)) | |
| 294 } else { | |
| 295 gp_shape_by=NULL | |
| 296 } | |
| 297 | |
| 298 # if (!is.na(opt$shape_by)) {gp_shape_by=interaction(df[shape_by]); | |
| 299 # gp_shape_by <- factor(gp_shape_by, levels=sort(levels(gp_shape_by))) | |
| 300 # mapping = modifyList(mapping, aes_string(shape=S_col)) | |
| 301 | |
| 302 class(mapping) <- "uneval" | |
| 303 | |
| 304 pointLayer <- layer( | |
| 305 geom = "point", | |
| 306 # geom_params = geom_params, | |
| 307 params = geom_params, | |
| 308 mapping = mapping, | |
| 309 stat = "identity", | |
| 310 position = "identity" | |
| 311 ) | |
| 312 | |
| 313 | |
| 314 | |
| 315 | |
| 316 # plotting... | |
| 317 gp = ggplot(df, aes_string(x=prinComp[1],y=prinComp[2])); | |
| 318 | |
| 319 if (opt$biplot) { | |
| 320 gp = gp + geom_point(data=centroids, aes_string(x=prinComp[1], y=prinComp[2], color=opt$color_by), shape=8, size=7) | |
| 321 gp = gp + geom_segment( | |
| 322 data=subset(closest_df, rank <= top), | |
| 323 aes_string(x=0, y=0, xend=prinComp[1], yend=prinComp[2], color=opt$color_by) | |
| 324 ) | |
| 325 } | |
| 326 | |
| 327 | |
| 328 if (opt$border) { | |
| 329 if (!is.null(opt$shape_by)) { | |
| 330 gp = gp + geom_point(aes(shape=gp_shape_by), col='black', size=pts+1.0); | |
| 331 } else { | |
| 332 gp = gp + geom_point(col="black", size=pts+1.0) | |
| 333 } | |
| 334 } | |
| 335 | |
| 336 gp = gp + pointLayer | |
| 337 | |
| 338 # gp = gp + geom_point(aes(color=gp_color_by)) | |
| 339 # gp = gp + geom_point(aes(col=gp_color_by, shape=gp_shape_by), size=pts); | |
| 340 # | |
| 341 gp = gp + labs(title=""); | |
| 342 gp = gp + labs(x=sprintf('%s (%s%%)', prinComp[1], variances[prinComp_i[1]])); | |
| 343 gp = gp + labs(y=sprintf('%s (%s%%)', prinComp[2], variances[prinComp_i[2]])); | |
| 344 | |
| 345 gp = gp + scale_color_manual(name=opt$color_by, values=my_palette) | |
| 346 if (!is.null(opt$shapes)) { | |
| 347 gp = gp + scale_shape_manual(name=opt$shape_by, values=my_shapes); | |
| 348 } | |
| 349 if (opt$no_legend) { | |
| 350 gp = gp + guides(shape=FALSE, color=FALSE) | |
| 351 | |
| 352 } | |
| 353 if (!is.null(opt$labels)) { | |
| 354 gp = gp + geom_text(aes_string(label=l_col), size=pts) | |
| 355 } | |
| 356 | |
| 357 gp | |
| 358 } | |
| 359 | |
| 360 | |
| 361 | |
| 362 | |
| 363 # -------------------- | |
| 364 # | |
| 365 # 3d scatterplot | |
| 366 # | |
| 367 # -------------------- | |
| 368 | |
| 369 | |
| 370 if (length(prinComp) == 3) { | |
| 371 | |
| 372 suppressPackageStartupMessages(library(scatterplot3d)) | |
| 373 | |
| 374 par(xpd=NA, omi=c(0.5, 0.5, 0.5, 1.0)) | |
| 375 | |
| 376 if (!is.na(opt$color_by)) {gp_color=my_palette[interaction(df[color_by])]} else {gp_color="black"} | |
| 377 if (!is.null(opt$shape_by)) {gp_shape_by=interaction(df[shape_by]); | |
| 378 gp_shape_by <- factor(gp_shape_by, levels=sort(intersect(levels(gp_shape_by), gp_shape_by))); gp_shape=my_shapes[gp_shape_by]} else {gp_shape_by=NULL} | |
| 379 | |
| 380 plot3d = scatterplot3d(df[prinComp], | |
| 381 color = gp_color, | |
| 382 pch = gp_shape, | |
| 383 xlab = sprintf('%s (%s%%)', prinComp[1], variances[prinComp_i[1]]), | |
| 384 ylab = sprintf('%s (%s%%)', prinComp[2], variances[prinComp_i[2]]), | |
| 385 zlab = sprintf('%s (%s%%)', prinComp[3], variances[prinComp_i[3]]), | |
| 386 cex.symbols = 1.5, | |
| 387 lab = c(5,4) | |
| 388 ) | |
| 389 | |
| 390 # !!! To be removed after the mouse paper !!! | |
| 391 #i=0; for(sample in interaction(df[color_by])) { | |
| 392 #i=i+1; plot3d$points3d(subset(df, General_category == sample, select=prinComp), type='l', col=gp_color[i])} | |
| 393 | |
| 394 if (!is.na(opt$color_by)) { | |
| 395 legend( | |
| 396 x = log(max(df[prinComp[1]])) + 3, | |
| 397 # x = 5, | |
| 398 y = 5.5, | |
| 399 legend = levels(interaction(df[color_by])), | |
| 400 fill = my_palette[seq_along(levels(interaction(df[color_by])))] | |
| 401 ) | |
| 402 } | |
| 403 | |
| 404 if (!is.na(opt$shape_by)) { | |
| 405 legend( | |
| 406 # x = -log(abs(min(df[prinComp[1]]))) - 1.5, | |
| 407 x = -3, | |
| 408 y = 6, | |
| 409 # y = 7.2, | |
| 410 legend = levels(gp_shape_by), | |
| 411 pch = my_shapes[seq_along(levels(gp_shape_by))] | |
| 412 ) | |
| 413 # legend(-log(abs(min(df[prinComp[1]])))+1.5,7.2,levels(gp_shape_by), | |
| 414 # pch=shapes[seq_along(levels(gp_shape_by))]) | |
| 415 } | |
| 416 } | |
| 417 | |
| 418 | |
| 419 dev.off() | |
| 420 q(save='no') | |
| 421 |
