Mercurial > repos > vmarcon > plot_pca

diff plot_pca.R @ 0:610e86c430a9 draft default tip
planemo upload commit 0b661bcf940c03e11becd42b3321df9573b591b2
author: vmarcon
date: Mon, 23 Oct 2017 09:34:56 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/plot_pca.R	Mon Oct 23 09:34:56 2017 -0400
@@ -0,0 +1,421 @@
+#!/usr/bin/env Rscript
+
+# DEFAULT OPTIONS
+
+opt = list()
+opt$log10 = FALSE
+opt$pseudocount = 1e-04
+opt$row_as_variables = FALSE
+
+suppressPackageStartupMessages(library("optparse"))
+
+options(stringsAsFactors=F)
+
+##################
+# OPTION PARSING
+##################
+
+option_list <- list(
+make_option(c("-i", "--input_matrix"), help="the matrix you want to analyze. Can be stdin"),
+make_option(c("-l", "--log10"), action="store_true", default=FALSE, help="apply the log [default=FALSE]"),
+make_option(c("-p", "--pseudocount"), type="double", help="specify a pseudocount for the log [default=%default]", default=1e-04),
+make_option(c("-m", "--metadata"), help="A list of tsv files with metadata on matrix experiment.\n\t\tThey must be in the format 'file1.tsv,file2.tsv' and contain a key column named 'labExpId'. Can be omitted"),
+
+make_option(c("--merge_mdata_on"), default="labExpId",
+	help="[default=%default]"),
+
+#make_option(c("-o", "--output"), help="additional info you want to put in the output file name", default="out"),
+make_option(c("-c", "--color_by"), help="choose the fields in the metadata you want to color by", type='character'),
+
+make_option(c("--sort_color"), type='character', 
+	help="A field for sorting colors. Can be omitted [default=%default]"),
+
+make_option(c("-s", "--shape_by"), default=NULL, type="character", help="choose the fields in the metadata you want to shape by"),
+
+make_option(c("--no_legend"), action="store_true", default=FALSE,
+	help="Do not show the legend [default=%default]"),
+
+make_option(c("-r", "--row_as_variables"), action="store_true", help="select this if you want rows as variables [default=%default]", default=FALSE),
+make_option(c("-C", "--princomp"), help="choose the principal components you want to plot. With 3 PC it gives a 3d plot [default='PC1,PC2']", default="PC1,PC2"),
+
+make_option(c("--print_scores"), action="store_true", default=FALSE, 
+	help="Output the resuling PCs as a separate file with the extension PCs.tsv [default=%default]"),
+
+make_option(c("--print_loadings"), action="store_true", default=FALSE, 
+	help="Output the resulting loadings as a separate file with the extension loadings.tsv [default=%default]"),
+
+make_option(c("--print_lambdas"), action="store_true", default=FALSE,
+	help="Output the resulting lambdas (stdev) as a separate file with the extension lambdas.tsv [default=%default]"),
+
+make_option(c("--biplot"), default=FALSE, action="store_true",
+	help="If active, the factor of the color is used as grouping factor.
+	Centroids are computed and the first <top> loadings are plotted wrt to the two specified components [default=%default]"),
+
+make_option(c("--palette"), default="/users/rg/abreschi/R/palettes/cbbPalette1.15.txt",
+	help="File with the color palette [default=%default]"),
+
+make_option(c("--border"), default=FALSE, action="store_true",
+	help="Black border to dots [default=%default]"),
+
+make_option(c("--shapes"), 
+	help="File with the shapes [default=%default]"),
+
+make_option(c("-L", "--labels"), default=NULL, type="character",
+	help="The metadata field with the labels [default=%default]"),
+
+make_option(c("-B", "--base_size"), default=16, type='numeric',
+	help="Base font size [default=%default]"),
+
+make_option(c("-H", "--height"), default=7,
+	help="Height of the plot in inches [default=%default]"),
+
+make_option(c("-W", "--width"), default=7,
+	help="Width of the plot in inches [default=%default]"),
+
+make_option(c("-o", "--output"), default="pca.out",
+	help="output file name [default=%default]"),
+
+make_option(c("-v", "--verbose"), action='store_true', default=FALSE,
+	help="verbose output [default=%default]")
+)
+
+parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
+arguments <- parse_args(parser, positional_arguments = TRUE)
+opt <- arguments$options
+
+if (opt$verbose) {print(opt)}
+##------------
+## LIBRARIES
+##------------ 
+suppressPackageStartupMessages(library("reshape"))
+suppressPackageStartupMessages(library("ggplot2"))
+suppressPackageStartupMessages(library("grid"))
+
+
+###############
+# BEGIN
+##############
+
+# read input tables
+inF = opt$input_matrix; if (opt$input_matrix == "stdin") {inF = file("stdin")}
+m = read.table(inF, h=T, sep="\t")
+if (opt$verbose) {
+	cat("Sample of input matrix:\n")
+	print(head(m[,1:10]))
+}
+
+
+# Read the color palette
+my_palette = read.table(opt$palette, h=F, comment.char="%", sep="\t")$V1
+
+# Read the color ordering
+if (is.null(opt$sort_color)) {
+	sort_color=NULL
+}else{
+	sort_color = strsplit(opt$sort_color, ",")[[1]]
+}
+
+# Read the shapes
+if (!is.null(opt$shapes)) {
+	my_shapes = read.table(opt$shapes, h=F, comment.char="%")$V1
+}
+
+# remove potential gene id columns
+char_cols <- which(sapply(m, class) == 'character')
+if (length(char_cols) == 0) {genes = rownames(m)}
+if (length(char_cols) != 0) {genes = m[,char_cols]; m = m[,-(char_cols)]}
+
+if (opt$verbose) {sprintf("WARNING: column %s is character, so it is removed from the analysis", char_cols)}
+
+# apply the log if required
+if (opt$log10) {m = log10(replace(m, is.na(m), 0) + opt$pseudocount)}
+
+# apply pca
+#if (opt$row_as_variable) {
+#m_pca = prcomp(na.omit(m), center=FALSE, scale.=FALSE)} else{
+#m_pca = prcomp(t(na.omit(m)), center=FALSE, scale.=FALSE)}
+#Modified by Gaelle Lefort
+# apply pca
+if (opt$row_as_variable) {
+m_pca = prcomp(t(na.omit(m)), center=FALSE, scale.=FALSE)
+} else {
+m_pca = prcomp(na.omit(m), center=FALSE, scale.=FALSE)
+}
+
+
+# Scale the scores for biplot
+#scaledScores = sweep(m_pca$x, 2, m_pca$sdev / sqrt(nrow(m_pca$x)), "/")
+scaledScores = m_pca$x
+
+if (opt$verbose) {print(dim(na.omit(m)))}
+
+# HANDLING METADATA
+
+# add metadata to pca results, they should be input in the command line in the future
+if (is.null(opt$color_by)) {color_by=NULL
+}else{color_by = color_by = strsplit(opt$color_by, ",")[[1]]}
+if (is.null(opt$shape_by)) {shape_by=NULL
+}else{shape_by = strsplit(opt$shape_by, ",")[[1]]}
+
+# read metadata, one or more table to be merged on labExpId
+if (!is.null(opt$metadata)){
+	mdata = read.table(opt$metadata, h=T, sep="\t", row.names=NULL, comment.char="", quote="");
+	if (opt$merge_mdata_on %in% colnames(mdata)) {
+		mdata[,opt$merge_mdata_on] <- gsub("[,-]", ".", mdata[,opt$merge_mdata_on])
+	}
+
+	if (opt$verbose) {cat('append metadata...')}
+	
+	df = merge(as.data.frame(scaledScores),
+	unique(mdata[c(color_by, shape_by, opt$merge_mdata_on, opt$labels)]), by.x='row.names', by.y=opt$merge_mdata_on, all.x=T)
+	if (opt$verbose) {cat("DONE\n")}
+}else{
+	df = as.data.frame(scaledScores)
+}
+
+
+if (opt$verbose) {print(head(df))}
+
+#########
+# OUTPUT
+#########
+
+output_name = opt$output
+
+# Print text outputs if required
+
+# -- principal components --
+if (opt$print_scores) {
+	write.table(m_pca$x, sprintf("%s.PCs.tsv", output_name), quote=F, sep="\t")
+}
+
+# -- loadings --
+if (opt$print_loadings) {
+	write.table(m_pca$rotation, sprintf("%s.loadings.tsv", output_name), quote=F, sep="\t")
+}
+
+# -- lambdas --
+if (opt$print_lambdas) {
+	perc = round(100*m_pca$sdev/sum(m_pca$sdev), 2)
+	variances = round(m_pca$sdev^2/sum(m_pca$sdev^2)*100, 2)
+	out_df = data.frame(lambda=m_pca$sdev, perc=perc, var_perc=variances)
+	write.table(out_df, sprintf("%s.lambdas.tsv", output_name), quote=F, sep="\t")
+}
+
+# Read the required components 
+prinComp = strsplit(opt$princomp, ",")[[1]]
+prinComp_i = as.numeric(gsub("PC", "", prinComp))
+
+# Get a vector with all the variance percentages
+variances = round(m_pca$sdev^2/sum(m_pca$sdev^2)*100, 2)
+
+if (opt$biplot) {
+
+	aggrVar = opt$color_by
+
+	# === Centroids ===
+
+	centroids = aggregate (
+		formula(sprintf(".~%s", aggrVar)), 
+		df[,c(colnames(df)[grep("^PC", colnames(df))], aggrVar)],
+		mean
+	)
+	centroidsM = centroids[,-1]
+	
+	# === Loadings ===
+	
+	vecNorm = function(x) {sqrt(sum(x**2))}
+	
+	scaledLoadings = sweep(m_pca$rotation, 2, m_pca$sdev * nrow(m_pca$x), "*")
+	
+	centroidsNorm = apply(centroidsM[,prinComp], 1, vecNorm)         # DIM: number of levels x 1
+	loadingsNorm = apply(scaledLoadings[,prinComp], 1, vecNorm)      # DIM: number of variables x 1
+	
+	cosine = ( scaledLoadings[,prinComp] %*% t(centroidsM[,prinComp]) ) / (loadingsNorm %*% t(centroidsNorm))
+	cosine = setNames(data.frame(cosine),  centroids[,1])
+
+	closest = setNames(melt(apply(1-cosine, 2, rank)), c("variable", aggrVar, "rank"))
+	write.table( closest, file=sprintf("%s.cosine.tsv", opt$output), quote=F, row.names=F, sep="\t");
+
+	closest_df = data.frame(merge(closest, scaledLoadings, by.x="variable", by.y="row.names"))
+
+	
+}
+
+
+
+#############
+# PLOT
+#############
+
+# plot parameters
+pts = 5
+
+l_col = opt$labels
+base_size = opt$base_size
+
+theme_set(theme_bw(base_size = base_size))
+theme_update(legend.text=element_text(size=0.9*base_size),
+	legend.key.size=unit(0.9*base_size, "points"),
+	legend.key = element_blank()
+)
+
+top = 30
+
+
+# Open device for plotting
+pdf(sprintf("%s.pdf", output_name), w=opt$width, h=opt$height)
+
+if (length(prinComp) == 2){
+
+	geom_params = list()
+	geom_params$size = pts
+#	geom_params$alpha = opt$alpha
+	
+	mapping = list()
+	mapping <- modifyList(mapping, aes_string(x=prinComp[1], y=prinComp[2]))
+
+	if (!is.null(opt$color_by)) {
+		gp_color_by=interaction(df[color_by])
+		if (!is.null(opt$sort_color)) {
+			gp_color_by = factor(gp_color_by, levels=sort_color)
+		}
+		mapping = modifyList(mapping, aes_string(color=gp_color_by, order=gp_color_by))
+	} else {
+		gp_color_by=NULL
+	}
+	
+	if (!is.null(opt$shape_by)) {
+		gp_shape_by=interaction(df[shape_by])
+		if (!is.null(opt$sort_shape)) {
+			gp_shape_by = factor(gp_shape_by, levels=sort_shape)
+		}
+		mapping = modifyList(mapping, aes_string(shape=gp_shape_by, order=gp_shape_by))
+	} else {
+		gp_shape_by=NULL
+	}
+
+#	if (!is.na(opt$shape_by)) {gp_shape_by=interaction(df[shape_by]);
+#	gp_shape_by <- factor(gp_shape_by, levels=sort(levels(gp_shape_by)))
+#	mapping = modifyList(mapping, aes_string(shape=S_col))
+	
+	class(mapping) <- "uneval"
+	
+	pointLayer <- layer(
+		geom = "point",
+	#	geom_params = geom_params,
+		params = geom_params,
+		mapping = mapping,
+		stat = "identity",
+		position = "identity"
+	)
+	
+
+
+
+	# plotting...
+	gp = ggplot(df, aes_string(x=prinComp[1],y=prinComp[2]));
+
+	if (opt$biplot) {
+		gp = gp + geom_point(data=centroids, aes_string(x=prinComp[1], y=prinComp[2], color=opt$color_by), shape=8, size=7)
+		gp = gp + geom_segment( 
+			data=subset(closest_df, rank <= top), 
+			aes_string(x=0, y=0, xend=prinComp[1], yend=prinComp[2], color=opt$color_by)
+		)
+	}
+
+
+	if (opt$border) {
+		if (!is.null(opt$shape_by)) {
+		gp = gp + geom_point(aes(shape=gp_shape_by), col='black', size=pts+1.0);
+		} else {
+		gp =  gp + geom_point(col="black", size=pts+1.0)
+		}
+	}
+
+	gp = gp + pointLayer
+
+#	gp = gp + geom_point(aes(color=gp_color_by))
+#	gp = gp + geom_point(aes(col=gp_color_by, shape=gp_shape_by), size=pts);
+#
+	gp = gp + labs(title="");
+	gp = gp + labs(x=sprintf('%s (%s%%)', prinComp[1], variances[prinComp_i[1]]));
+	gp = gp + labs(y=sprintf('%s (%s%%)', prinComp[2], variances[prinComp_i[2]]));
+
+	gp = gp + scale_color_manual(name=opt$color_by, values=my_palette)
+	if (!is.null(opt$shapes)) {
+		gp = gp + scale_shape_manual(name=opt$shape_by, values=my_shapes);
+	}
+	if (opt$no_legend) {
+		gp = gp + guides(shape=FALSE, color=FALSE)
+	
+}
+	if (!is.null(opt$labels)) {
+		gp = gp + geom_text(aes_string(label=l_col), size=pts)
+	}
+
+	gp
+} 
+
+
+
+
+# --------------------
+#
+# 3d scatterplot
+#
+# --------------------
+
+
+if (length(prinComp) == 3) {
+
+suppressPackageStartupMessages(library(scatterplot3d))
+
+par(xpd=NA, omi=c(0.5, 0.5, 0.5, 1.0))
+
+if (!is.na(opt$color_by)) {gp_color=my_palette[interaction(df[color_by])]} else {gp_color="black"}
+if (!is.null(opt$shape_by)) {gp_shape_by=interaction(df[shape_by]);
+gp_shape_by <- factor(gp_shape_by, levels=sort(intersect(levels(gp_shape_by), gp_shape_by))); gp_shape=my_shapes[gp_shape_by]} else {gp_shape_by=NULL}
+
+plot3d = scatterplot3d(df[prinComp], 
+	color = gp_color,
+	pch = gp_shape,
+	xlab = sprintf('%s (%s%%)', prinComp[1], variances[prinComp_i[1]]),
+	ylab = sprintf('%s (%s%%)', prinComp[2], variances[prinComp_i[2]]),
+	zlab = sprintf('%s (%s%%)', prinComp[3], variances[prinComp_i[3]]),
+	cex.symbols = 1.5,
+	lab = c(5,4)
+)
+
+# !!! To be removed after the mouse paper !!!
+#i=0; for(sample in interaction(df[color_by])) {
+#i=i+1; plot3d$points3d(subset(df, General_category == sample, select=prinComp), type='l', col=gp_color[i])}
+
+if (!is.na(opt$color_by)) {
+	legend(
+		x = log(max(df[prinComp[1]])) + 3,
+#		x = 5,
+		y = 5.5,
+		legend = levels(interaction(df[color_by])), 
+		fill = my_palette[seq_along(levels(interaction(df[color_by])))]
+	)
+}
+
+if (!is.na(opt$shape_by)) {
+	legend(
+#		x = -log(abs(min(df[prinComp[1]]))) - 1.5, 
+		x = -3,
+		y = 6, 
+#		y = 7.2,
+		legend = levels(gp_shape_by), 
+		pch = my_shapes[seq_along(levels(gp_shape_by))]
+		)
+#	legend(-log(abs(min(df[prinComp[1]])))+1.5,7.2,levels(gp_shape_by), 
+#	pch=shapes[seq_along(levels(gp_shape_by))])
+}
+}
+
+
+dev.off()
+q(save='no')
+
author	vmarcon
date	Mon, 23 Oct 2017 09:34:56 -0400
parents
children