Mercurial > repos > vmarcon > plot_pca
diff plot_pca.R @ 0:610e86c430a9 draft default tip
planemo upload commit 0b661bcf940c03e11becd42b3321df9573b591b2
author | vmarcon |
---|---|
date | Mon, 23 Oct 2017 09:34:56 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/plot_pca.R Mon Oct 23 09:34:56 2017 -0400 @@ -0,0 +1,421 @@ +#!/usr/bin/env Rscript + +# DEFAULT OPTIONS + +opt = list() +opt$log10 = FALSE +opt$pseudocount = 1e-04 +opt$row_as_variables = FALSE + +suppressPackageStartupMessages(library("optparse")) + +options(stringsAsFactors=F) + +################## +# OPTION PARSING +################## + +option_list <- list( +make_option(c("-i", "--input_matrix"), help="the matrix you want to analyze. Can be stdin"), +make_option(c("-l", "--log10"), action="store_true", default=FALSE, help="apply the log [default=FALSE]"), +make_option(c("-p", "--pseudocount"), type="double", help="specify a pseudocount for the log [default=%default]", default=1e-04), +make_option(c("-m", "--metadata"), help="A list of tsv files with metadata on matrix experiment.\n\t\tThey must be in the format 'file1.tsv,file2.tsv' and contain a key column named 'labExpId'. Can be omitted"), + +make_option(c("--merge_mdata_on"), default="labExpId", + help="[default=%default]"), + +#make_option(c("-o", "--output"), help="additional info you want to put in the output file name", default="out"), +make_option(c("-c", "--color_by"), help="choose the fields in the metadata you want to color by", type='character'), + +make_option(c("--sort_color"), type='character', + help="A field for sorting colors. Can be omitted [default=%default]"), + +make_option(c("-s", "--shape_by"), default=NULL, type="character", help="choose the fields in the metadata you want to shape by"), + +make_option(c("--no_legend"), action="store_true", default=FALSE, + help="Do not show the legend [default=%default]"), + +make_option(c("-r", "--row_as_variables"), action="store_true", help="select this if you want rows as variables [default=%default]", default=FALSE), +make_option(c("-C", "--princomp"), help="choose the principal components you want to plot. With 3 PC it gives a 3d plot [default='PC1,PC2']", default="PC1,PC2"), + +make_option(c("--print_scores"), action="store_true", default=FALSE, + help="Output the resuling PCs as a separate file with the extension PCs.tsv [default=%default]"), + +make_option(c("--print_loadings"), action="store_true", default=FALSE, + help="Output the resulting loadings as a separate file with the extension loadings.tsv [default=%default]"), + +make_option(c("--print_lambdas"), action="store_true", default=FALSE, + help="Output the resulting lambdas (stdev) as a separate file with the extension lambdas.tsv [default=%default]"), + +make_option(c("--biplot"), default=FALSE, action="store_true", + help="If active, the factor of the color is used as grouping factor. + Centroids are computed and the first <top> loadings are plotted wrt to the two specified components [default=%default]"), + +make_option(c("--palette"), default="/users/rg/abreschi/R/palettes/cbbPalette1.15.txt", + help="File with the color palette [default=%default]"), + +make_option(c("--border"), default=FALSE, action="store_true", + help="Black border to dots [default=%default]"), + +make_option(c("--shapes"), + help="File with the shapes [default=%default]"), + +make_option(c("-L", "--labels"), default=NULL, type="character", + help="The metadata field with the labels [default=%default]"), + +make_option(c("-B", "--base_size"), default=16, type='numeric', + help="Base font size [default=%default]"), + +make_option(c("-H", "--height"), default=7, + help="Height of the plot in inches [default=%default]"), + +make_option(c("-W", "--width"), default=7, + help="Width of the plot in inches [default=%default]"), + +make_option(c("-o", "--output"), default="pca.out", + help="output file name [default=%default]"), + +make_option(c("-v", "--verbose"), action='store_true', default=FALSE, + help="verbose output [default=%default]") +) + +parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) +arguments <- parse_args(parser, positional_arguments = TRUE) +opt <- arguments$options + +if (opt$verbose) {print(opt)} +##------------ +## LIBRARIES +##------------ +suppressPackageStartupMessages(library("reshape")) +suppressPackageStartupMessages(library("ggplot2")) +suppressPackageStartupMessages(library("grid")) + + +############### +# BEGIN +############## + +# read input tables +inF = opt$input_matrix; if (opt$input_matrix == "stdin") {inF = file("stdin")} +m = read.table(inF, h=T, sep="\t") +if (opt$verbose) { + cat("Sample of input matrix:\n") + print(head(m[,1:10])) +} + + +# Read the color palette +my_palette = read.table(opt$palette, h=F, comment.char="%", sep="\t")$V1 + +# Read the color ordering +if (is.null(opt$sort_color)) { + sort_color=NULL +}else{ + sort_color = strsplit(opt$sort_color, ",")[[1]] +} + +# Read the shapes +if (!is.null(opt$shapes)) { + my_shapes = read.table(opt$shapes, h=F, comment.char="%")$V1 +} + +# remove potential gene id columns +char_cols <- which(sapply(m, class) == 'character') +if (length(char_cols) == 0) {genes = rownames(m)} +if (length(char_cols) != 0) {genes = m[,char_cols]; m = m[,-(char_cols)]} + +if (opt$verbose) {sprintf("WARNING: column %s is character, so it is removed from the analysis", char_cols)} + +# apply the log if required +if (opt$log10) {m = log10(replace(m, is.na(m), 0) + opt$pseudocount)} + +# apply pca +#if (opt$row_as_variable) { +#m_pca = prcomp(na.omit(m), center=FALSE, scale.=FALSE)} else{ +#m_pca = prcomp(t(na.omit(m)), center=FALSE, scale.=FALSE)} +#Modified by Gaelle Lefort +# apply pca +if (opt$row_as_variable) { +m_pca = prcomp(t(na.omit(m)), center=FALSE, scale.=FALSE) +} else { +m_pca = prcomp(na.omit(m), center=FALSE, scale.=FALSE) +} + + +# Scale the scores for biplot +#scaledScores = sweep(m_pca$x, 2, m_pca$sdev / sqrt(nrow(m_pca$x)), "/") +scaledScores = m_pca$x + +if (opt$verbose) {print(dim(na.omit(m)))} + +# HANDLING METADATA + +# add metadata to pca results, they should be input in the command line in the future +if (is.null(opt$color_by)) {color_by=NULL +}else{color_by = color_by = strsplit(opt$color_by, ",")[[1]]} +if (is.null(opt$shape_by)) {shape_by=NULL +}else{shape_by = strsplit(opt$shape_by, ",")[[1]]} + +# read metadata, one or more table to be merged on labExpId +if (!is.null(opt$metadata)){ + mdata = read.table(opt$metadata, h=T, sep="\t", row.names=NULL, comment.char="", quote=""); + if (opt$merge_mdata_on %in% colnames(mdata)) { + mdata[,opt$merge_mdata_on] <- gsub("[,-]", ".", mdata[,opt$merge_mdata_on]) + } + + if (opt$verbose) {cat('append metadata...')} + + df = merge(as.data.frame(scaledScores), + unique(mdata[c(color_by, shape_by, opt$merge_mdata_on, opt$labels)]), by.x='row.names', by.y=opt$merge_mdata_on, all.x=T) + if (opt$verbose) {cat("DONE\n")} +}else{ + df = as.data.frame(scaledScores) +} + + +if (opt$verbose) {print(head(df))} + +######### +# OUTPUT +######### + +output_name = opt$output + +# Print text outputs if required + +# -- principal components -- +if (opt$print_scores) { + write.table(m_pca$x, sprintf("%s.PCs.tsv", output_name), quote=F, sep="\t") +} + +# -- loadings -- +if (opt$print_loadings) { + write.table(m_pca$rotation, sprintf("%s.loadings.tsv", output_name), quote=F, sep="\t") +} + +# -- lambdas -- +if (opt$print_lambdas) { + perc = round(100*m_pca$sdev/sum(m_pca$sdev), 2) + variances = round(m_pca$sdev^2/sum(m_pca$sdev^2)*100, 2) + out_df = data.frame(lambda=m_pca$sdev, perc=perc, var_perc=variances) + write.table(out_df, sprintf("%s.lambdas.tsv", output_name), quote=F, sep="\t") +} + +# Read the required components +prinComp = strsplit(opt$princomp, ",")[[1]] +prinComp_i = as.numeric(gsub("PC", "", prinComp)) + +# Get a vector with all the variance percentages +variances = round(m_pca$sdev^2/sum(m_pca$sdev^2)*100, 2) + +if (opt$biplot) { + + aggrVar = opt$color_by + + # === Centroids === + + centroids = aggregate ( + formula(sprintf(".~%s", aggrVar)), + df[,c(colnames(df)[grep("^PC", colnames(df))], aggrVar)], + mean + ) + centroidsM = centroids[,-1] + + # === Loadings === + + vecNorm = function(x) {sqrt(sum(x**2))} + + scaledLoadings = sweep(m_pca$rotation, 2, m_pca$sdev * nrow(m_pca$x), "*") + + centroidsNorm = apply(centroidsM[,prinComp], 1, vecNorm) # DIM: number of levels x 1 + loadingsNorm = apply(scaledLoadings[,prinComp], 1, vecNorm) # DIM: number of variables x 1 + + cosine = ( scaledLoadings[,prinComp] %*% t(centroidsM[,prinComp]) ) / (loadingsNorm %*% t(centroidsNorm)) + cosine = setNames(data.frame(cosine), centroids[,1]) + + closest = setNames(melt(apply(1-cosine, 2, rank)), c("variable", aggrVar, "rank")) + write.table( closest, file=sprintf("%s.cosine.tsv", opt$output), quote=F, row.names=F, sep="\t"); + + closest_df = data.frame(merge(closest, scaledLoadings, by.x="variable", by.y="row.names")) + + +} + + + +############# +# PLOT +############# + +# plot parameters +pts = 5 + +l_col = opt$labels +base_size = opt$base_size + +theme_set(theme_bw(base_size = base_size)) +theme_update(legend.text=element_text(size=0.9*base_size), + legend.key.size=unit(0.9*base_size, "points"), + legend.key = element_blank() +) + +top = 30 + + +# Open device for plotting +pdf(sprintf("%s.pdf", output_name), w=opt$width, h=opt$height) + +if (length(prinComp) == 2){ + + geom_params = list() + geom_params$size = pts +# geom_params$alpha = opt$alpha + + mapping = list() + mapping <- modifyList(mapping, aes_string(x=prinComp[1], y=prinComp[2])) + + if (!is.null(opt$color_by)) { + gp_color_by=interaction(df[color_by]) + if (!is.null(opt$sort_color)) { + gp_color_by = factor(gp_color_by, levels=sort_color) + } + mapping = modifyList(mapping, aes_string(color=gp_color_by, order=gp_color_by)) + } else { + gp_color_by=NULL + } + + if (!is.null(opt$shape_by)) { + gp_shape_by=interaction(df[shape_by]) + if (!is.null(opt$sort_shape)) { + gp_shape_by = factor(gp_shape_by, levels=sort_shape) + } + mapping = modifyList(mapping, aes_string(shape=gp_shape_by, order=gp_shape_by)) + } else { + gp_shape_by=NULL + } + +# if (!is.na(opt$shape_by)) {gp_shape_by=interaction(df[shape_by]); +# gp_shape_by <- factor(gp_shape_by, levels=sort(levels(gp_shape_by))) +# mapping = modifyList(mapping, aes_string(shape=S_col)) + + class(mapping) <- "uneval" + + pointLayer <- layer( + geom = "point", + # geom_params = geom_params, + params = geom_params, + mapping = mapping, + stat = "identity", + position = "identity" + ) + + + + + # plotting... + gp = ggplot(df, aes_string(x=prinComp[1],y=prinComp[2])); + + if (opt$biplot) { + gp = gp + geom_point(data=centroids, aes_string(x=prinComp[1], y=prinComp[2], color=opt$color_by), shape=8, size=7) + gp = gp + geom_segment( + data=subset(closest_df, rank <= top), + aes_string(x=0, y=0, xend=prinComp[1], yend=prinComp[2], color=opt$color_by) + ) + } + + + if (opt$border) { + if (!is.null(opt$shape_by)) { + gp = gp + geom_point(aes(shape=gp_shape_by), col='black', size=pts+1.0); + } else { + gp = gp + geom_point(col="black", size=pts+1.0) + } + } + + gp = gp + pointLayer + +# gp = gp + geom_point(aes(color=gp_color_by)) +# gp = gp + geom_point(aes(col=gp_color_by, shape=gp_shape_by), size=pts); +# + gp = gp + labs(title=""); + gp = gp + labs(x=sprintf('%s (%s%%)', prinComp[1], variances[prinComp_i[1]])); + gp = gp + labs(y=sprintf('%s (%s%%)', prinComp[2], variances[prinComp_i[2]])); + + gp = gp + scale_color_manual(name=opt$color_by, values=my_palette) + if (!is.null(opt$shapes)) { + gp = gp + scale_shape_manual(name=opt$shape_by, values=my_shapes); + } + if (opt$no_legend) { + gp = gp + guides(shape=FALSE, color=FALSE) + +} + if (!is.null(opt$labels)) { + gp = gp + geom_text(aes_string(label=l_col), size=pts) + } + + gp +} + + + + +# -------------------- +# +# 3d scatterplot +# +# -------------------- + + +if (length(prinComp) == 3) { + +suppressPackageStartupMessages(library(scatterplot3d)) + +par(xpd=NA, omi=c(0.5, 0.5, 0.5, 1.0)) + +if (!is.na(opt$color_by)) {gp_color=my_palette[interaction(df[color_by])]} else {gp_color="black"} +if (!is.null(opt$shape_by)) {gp_shape_by=interaction(df[shape_by]); +gp_shape_by <- factor(gp_shape_by, levels=sort(intersect(levels(gp_shape_by), gp_shape_by))); gp_shape=my_shapes[gp_shape_by]} else {gp_shape_by=NULL} + +plot3d = scatterplot3d(df[prinComp], + color = gp_color, + pch = gp_shape, + xlab = sprintf('%s (%s%%)', prinComp[1], variances[prinComp_i[1]]), + ylab = sprintf('%s (%s%%)', prinComp[2], variances[prinComp_i[2]]), + zlab = sprintf('%s (%s%%)', prinComp[3], variances[prinComp_i[3]]), + cex.symbols = 1.5, + lab = c(5,4) +) + +# !!! To be removed after the mouse paper !!! +#i=0; for(sample in interaction(df[color_by])) { +#i=i+1; plot3d$points3d(subset(df, General_category == sample, select=prinComp), type='l', col=gp_color[i])} + +if (!is.na(opt$color_by)) { + legend( + x = log(max(df[prinComp[1]])) + 3, +# x = 5, + y = 5.5, + legend = levels(interaction(df[color_by])), + fill = my_palette[seq_along(levels(interaction(df[color_by])))] + ) +} + +if (!is.na(opt$shape_by)) { + legend( +# x = -log(abs(min(df[prinComp[1]]))) - 1.5, + x = -3, + y = 6, +# y = 7.2, + legend = levels(gp_shape_by), + pch = my_shapes[seq_along(levels(gp_shape_by))] + ) +# legend(-log(abs(min(df[prinComp[1]])))+1.5,7.2,levels(gp_shape_by), +# pch=shapes[seq_along(levels(gp_shape_by))]) +} +} + + +dev.off() +q(save='no') +