Mercurial > repos > vmarcon > plot_pca
comparison plot_pca.R @ 0:610e86c430a9 draft default tip
planemo upload commit 0b661bcf940c03e11becd42b3321df9573b591b2
author | vmarcon |
---|---|
date | Mon, 23 Oct 2017 09:34:56 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:610e86c430a9 |
---|---|
1 #!/usr/bin/env Rscript | |
2 | |
3 # DEFAULT OPTIONS | |
4 | |
5 opt = list() | |
6 opt$log10 = FALSE | |
7 opt$pseudocount = 1e-04 | |
8 opt$row_as_variables = FALSE | |
9 | |
10 suppressPackageStartupMessages(library("optparse")) | |
11 | |
12 options(stringsAsFactors=F) | |
13 | |
14 ################## | |
15 # OPTION PARSING | |
16 ################## | |
17 | |
18 option_list <- list( | |
19 make_option(c("-i", "--input_matrix"), help="the matrix you want to analyze. Can be stdin"), | |
20 make_option(c("-l", "--log10"), action="store_true", default=FALSE, help="apply the log [default=FALSE]"), | |
21 make_option(c("-p", "--pseudocount"), type="double", help="specify a pseudocount for the log [default=%default]", default=1e-04), | |
22 make_option(c("-m", "--metadata"), help="A list of tsv files with metadata on matrix experiment.\n\t\tThey must be in the format 'file1.tsv,file2.tsv' and contain a key column named 'labExpId'. Can be omitted"), | |
23 | |
24 make_option(c("--merge_mdata_on"), default="labExpId", | |
25 help="[default=%default]"), | |
26 | |
27 #make_option(c("-o", "--output"), help="additional info you want to put in the output file name", default="out"), | |
28 make_option(c("-c", "--color_by"), help="choose the fields in the metadata you want to color by", type='character'), | |
29 | |
30 make_option(c("--sort_color"), type='character', | |
31 help="A field for sorting colors. Can be omitted [default=%default]"), | |
32 | |
33 make_option(c("-s", "--shape_by"), default=NULL, type="character", help="choose the fields in the metadata you want to shape by"), | |
34 | |
35 make_option(c("--no_legend"), action="store_true", default=FALSE, | |
36 help="Do not show the legend [default=%default]"), | |
37 | |
38 make_option(c("-r", "--row_as_variables"), action="store_true", help="select this if you want rows as variables [default=%default]", default=FALSE), | |
39 make_option(c("-C", "--princomp"), help="choose the principal components you want to plot. With 3 PC it gives a 3d plot [default='PC1,PC2']", default="PC1,PC2"), | |
40 | |
41 make_option(c("--print_scores"), action="store_true", default=FALSE, | |
42 help="Output the resuling PCs as a separate file with the extension PCs.tsv [default=%default]"), | |
43 | |
44 make_option(c("--print_loadings"), action="store_true", default=FALSE, | |
45 help="Output the resulting loadings as a separate file with the extension loadings.tsv [default=%default]"), | |
46 | |
47 make_option(c("--print_lambdas"), action="store_true", default=FALSE, | |
48 help="Output the resulting lambdas (stdev) as a separate file with the extension lambdas.tsv [default=%default]"), | |
49 | |
50 make_option(c("--biplot"), default=FALSE, action="store_true", | |
51 help="If active, the factor of the color is used as grouping factor. | |
52 Centroids are computed and the first <top> loadings are plotted wrt to the two specified components [default=%default]"), | |
53 | |
54 make_option(c("--palette"), default="/users/rg/abreschi/R/palettes/cbbPalette1.15.txt", | |
55 help="File with the color palette [default=%default]"), | |
56 | |
57 make_option(c("--border"), default=FALSE, action="store_true", | |
58 help="Black border to dots [default=%default]"), | |
59 | |
60 make_option(c("--shapes"), | |
61 help="File with the shapes [default=%default]"), | |
62 | |
63 make_option(c("-L", "--labels"), default=NULL, type="character", | |
64 help="The metadata field with the labels [default=%default]"), | |
65 | |
66 make_option(c("-B", "--base_size"), default=16, type='numeric', | |
67 help="Base font size [default=%default]"), | |
68 | |
69 make_option(c("-H", "--height"), default=7, | |
70 help="Height of the plot in inches [default=%default]"), | |
71 | |
72 make_option(c("-W", "--width"), default=7, | |
73 help="Width of the plot in inches [default=%default]"), | |
74 | |
75 make_option(c("-o", "--output"), default="pca.out", | |
76 help="output file name [default=%default]"), | |
77 | |
78 make_option(c("-v", "--verbose"), action='store_true', default=FALSE, | |
79 help="verbose output [default=%default]") | |
80 ) | |
81 | |
82 parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) | |
83 arguments <- parse_args(parser, positional_arguments = TRUE) | |
84 opt <- arguments$options | |
85 | |
86 if (opt$verbose) {print(opt)} | |
87 ##------------ | |
88 ## LIBRARIES | |
89 ##------------ | |
90 suppressPackageStartupMessages(library("reshape")) | |
91 suppressPackageStartupMessages(library("ggplot2")) | |
92 suppressPackageStartupMessages(library("grid")) | |
93 | |
94 | |
95 ############### | |
96 # BEGIN | |
97 ############## | |
98 | |
99 # read input tables | |
100 inF = opt$input_matrix; if (opt$input_matrix == "stdin") {inF = file("stdin")} | |
101 m = read.table(inF, h=T, sep="\t") | |
102 if (opt$verbose) { | |
103 cat("Sample of input matrix:\n") | |
104 print(head(m[,1:10])) | |
105 } | |
106 | |
107 | |
108 # Read the color palette | |
109 my_palette = read.table(opt$palette, h=F, comment.char="%", sep="\t")$V1 | |
110 | |
111 # Read the color ordering | |
112 if (is.null(opt$sort_color)) { | |
113 sort_color=NULL | |
114 }else{ | |
115 sort_color = strsplit(opt$sort_color, ",")[[1]] | |
116 } | |
117 | |
118 # Read the shapes | |
119 if (!is.null(opt$shapes)) { | |
120 my_shapes = read.table(opt$shapes, h=F, comment.char="%")$V1 | |
121 } | |
122 | |
123 # remove potential gene id columns | |
124 char_cols <- which(sapply(m, class) == 'character') | |
125 if (length(char_cols) == 0) {genes = rownames(m)} | |
126 if (length(char_cols) != 0) {genes = m[,char_cols]; m = m[,-(char_cols)]} | |
127 | |
128 if (opt$verbose) {sprintf("WARNING: column %s is character, so it is removed from the analysis", char_cols)} | |
129 | |
130 # apply the log if required | |
131 if (opt$log10) {m = log10(replace(m, is.na(m), 0) + opt$pseudocount)} | |
132 | |
133 # apply pca | |
134 #if (opt$row_as_variable) { | |
135 #m_pca = prcomp(na.omit(m), center=FALSE, scale.=FALSE)} else{ | |
136 #m_pca = prcomp(t(na.omit(m)), center=FALSE, scale.=FALSE)} | |
137 #Modified by Gaelle Lefort | |
138 # apply pca | |
139 if (opt$row_as_variable) { | |
140 m_pca = prcomp(t(na.omit(m)), center=FALSE, scale.=FALSE) | |
141 } else { | |
142 m_pca = prcomp(na.omit(m), center=FALSE, scale.=FALSE) | |
143 } | |
144 | |
145 | |
146 # Scale the scores for biplot | |
147 #scaledScores = sweep(m_pca$x, 2, m_pca$sdev / sqrt(nrow(m_pca$x)), "/") | |
148 scaledScores = m_pca$x | |
149 | |
150 if (opt$verbose) {print(dim(na.omit(m)))} | |
151 | |
152 # HANDLING METADATA | |
153 | |
154 # add metadata to pca results, they should be input in the command line in the future | |
155 if (is.null(opt$color_by)) {color_by=NULL | |
156 }else{color_by = color_by = strsplit(opt$color_by, ",")[[1]]} | |
157 if (is.null(opt$shape_by)) {shape_by=NULL | |
158 }else{shape_by = strsplit(opt$shape_by, ",")[[1]]} | |
159 | |
160 # read metadata, one or more table to be merged on labExpId | |
161 if (!is.null(opt$metadata)){ | |
162 mdata = read.table(opt$metadata, h=T, sep="\t", row.names=NULL, comment.char="", quote=""); | |
163 if (opt$merge_mdata_on %in% colnames(mdata)) { | |
164 mdata[,opt$merge_mdata_on] <- gsub("[,-]", ".", mdata[,opt$merge_mdata_on]) | |
165 } | |
166 | |
167 if (opt$verbose) {cat('append metadata...')} | |
168 | |
169 df = merge(as.data.frame(scaledScores), | |
170 unique(mdata[c(color_by, shape_by, opt$merge_mdata_on, opt$labels)]), by.x='row.names', by.y=opt$merge_mdata_on, all.x=T) | |
171 if (opt$verbose) {cat("DONE\n")} | |
172 }else{ | |
173 df = as.data.frame(scaledScores) | |
174 } | |
175 | |
176 | |
177 if (opt$verbose) {print(head(df))} | |
178 | |
179 ######### | |
180 # OUTPUT | |
181 ######### | |
182 | |
183 output_name = opt$output | |
184 | |
185 # Print text outputs if required | |
186 | |
187 # -- principal components -- | |
188 if (opt$print_scores) { | |
189 write.table(m_pca$x, sprintf("%s.PCs.tsv", output_name), quote=F, sep="\t") | |
190 } | |
191 | |
192 # -- loadings -- | |
193 if (opt$print_loadings) { | |
194 write.table(m_pca$rotation, sprintf("%s.loadings.tsv", output_name), quote=F, sep="\t") | |
195 } | |
196 | |
197 # -- lambdas -- | |
198 if (opt$print_lambdas) { | |
199 perc = round(100*m_pca$sdev/sum(m_pca$sdev), 2) | |
200 variances = round(m_pca$sdev^2/sum(m_pca$sdev^2)*100, 2) | |
201 out_df = data.frame(lambda=m_pca$sdev, perc=perc, var_perc=variances) | |
202 write.table(out_df, sprintf("%s.lambdas.tsv", output_name), quote=F, sep="\t") | |
203 } | |
204 | |
205 # Read the required components | |
206 prinComp = strsplit(opt$princomp, ",")[[1]] | |
207 prinComp_i = as.numeric(gsub("PC", "", prinComp)) | |
208 | |
209 # Get a vector with all the variance percentages | |
210 variances = round(m_pca$sdev^2/sum(m_pca$sdev^2)*100, 2) | |
211 | |
212 if (opt$biplot) { | |
213 | |
214 aggrVar = opt$color_by | |
215 | |
216 # === Centroids === | |
217 | |
218 centroids = aggregate ( | |
219 formula(sprintf(".~%s", aggrVar)), | |
220 df[,c(colnames(df)[grep("^PC", colnames(df))], aggrVar)], | |
221 mean | |
222 ) | |
223 centroidsM = centroids[,-1] | |
224 | |
225 # === Loadings === | |
226 | |
227 vecNorm = function(x) {sqrt(sum(x**2))} | |
228 | |
229 scaledLoadings = sweep(m_pca$rotation, 2, m_pca$sdev * nrow(m_pca$x), "*") | |
230 | |
231 centroidsNorm = apply(centroidsM[,prinComp], 1, vecNorm) # DIM: number of levels x 1 | |
232 loadingsNorm = apply(scaledLoadings[,prinComp], 1, vecNorm) # DIM: number of variables x 1 | |
233 | |
234 cosine = ( scaledLoadings[,prinComp] %*% t(centroidsM[,prinComp]) ) / (loadingsNorm %*% t(centroidsNorm)) | |
235 cosine = setNames(data.frame(cosine), centroids[,1]) | |
236 | |
237 closest = setNames(melt(apply(1-cosine, 2, rank)), c("variable", aggrVar, "rank")) | |
238 write.table( closest, file=sprintf("%s.cosine.tsv", opt$output), quote=F, row.names=F, sep="\t"); | |
239 | |
240 closest_df = data.frame(merge(closest, scaledLoadings, by.x="variable", by.y="row.names")) | |
241 | |
242 | |
243 } | |
244 | |
245 | |
246 | |
247 ############# | |
248 # PLOT | |
249 ############# | |
250 | |
251 # plot parameters | |
252 pts = 5 | |
253 | |
254 l_col = opt$labels | |
255 base_size = opt$base_size | |
256 | |
257 theme_set(theme_bw(base_size = base_size)) | |
258 theme_update(legend.text=element_text(size=0.9*base_size), | |
259 legend.key.size=unit(0.9*base_size, "points"), | |
260 legend.key = element_blank() | |
261 ) | |
262 | |
263 top = 30 | |
264 | |
265 | |
266 # Open device for plotting | |
267 pdf(sprintf("%s.pdf", output_name), w=opt$width, h=opt$height) | |
268 | |
269 if (length(prinComp) == 2){ | |
270 | |
271 geom_params = list() | |
272 geom_params$size = pts | |
273 # geom_params$alpha = opt$alpha | |
274 | |
275 mapping = list() | |
276 mapping <- modifyList(mapping, aes_string(x=prinComp[1], y=prinComp[2])) | |
277 | |
278 if (!is.null(opt$color_by)) { | |
279 gp_color_by=interaction(df[color_by]) | |
280 if (!is.null(opt$sort_color)) { | |
281 gp_color_by = factor(gp_color_by, levels=sort_color) | |
282 } | |
283 mapping = modifyList(mapping, aes_string(color=gp_color_by, order=gp_color_by)) | |
284 } else { | |
285 gp_color_by=NULL | |
286 } | |
287 | |
288 if (!is.null(opt$shape_by)) { | |
289 gp_shape_by=interaction(df[shape_by]) | |
290 if (!is.null(opt$sort_shape)) { | |
291 gp_shape_by = factor(gp_shape_by, levels=sort_shape) | |
292 } | |
293 mapping = modifyList(mapping, aes_string(shape=gp_shape_by, order=gp_shape_by)) | |
294 } else { | |
295 gp_shape_by=NULL | |
296 } | |
297 | |
298 # if (!is.na(opt$shape_by)) {gp_shape_by=interaction(df[shape_by]); | |
299 # gp_shape_by <- factor(gp_shape_by, levels=sort(levels(gp_shape_by))) | |
300 # mapping = modifyList(mapping, aes_string(shape=S_col)) | |
301 | |
302 class(mapping) <- "uneval" | |
303 | |
304 pointLayer <- layer( | |
305 geom = "point", | |
306 # geom_params = geom_params, | |
307 params = geom_params, | |
308 mapping = mapping, | |
309 stat = "identity", | |
310 position = "identity" | |
311 ) | |
312 | |
313 | |
314 | |
315 | |
316 # plotting... | |
317 gp = ggplot(df, aes_string(x=prinComp[1],y=prinComp[2])); | |
318 | |
319 if (opt$biplot) { | |
320 gp = gp + geom_point(data=centroids, aes_string(x=prinComp[1], y=prinComp[2], color=opt$color_by), shape=8, size=7) | |
321 gp = gp + geom_segment( | |
322 data=subset(closest_df, rank <= top), | |
323 aes_string(x=0, y=0, xend=prinComp[1], yend=prinComp[2], color=opt$color_by) | |
324 ) | |
325 } | |
326 | |
327 | |
328 if (opt$border) { | |
329 if (!is.null(opt$shape_by)) { | |
330 gp = gp + geom_point(aes(shape=gp_shape_by), col='black', size=pts+1.0); | |
331 } else { | |
332 gp = gp + geom_point(col="black", size=pts+1.0) | |
333 } | |
334 } | |
335 | |
336 gp = gp + pointLayer | |
337 | |
338 # gp = gp + geom_point(aes(color=gp_color_by)) | |
339 # gp = gp + geom_point(aes(col=gp_color_by, shape=gp_shape_by), size=pts); | |
340 # | |
341 gp = gp + labs(title=""); | |
342 gp = gp + labs(x=sprintf('%s (%s%%)', prinComp[1], variances[prinComp_i[1]])); | |
343 gp = gp + labs(y=sprintf('%s (%s%%)', prinComp[2], variances[prinComp_i[2]])); | |
344 | |
345 gp = gp + scale_color_manual(name=opt$color_by, values=my_palette) | |
346 if (!is.null(opt$shapes)) { | |
347 gp = gp + scale_shape_manual(name=opt$shape_by, values=my_shapes); | |
348 } | |
349 if (opt$no_legend) { | |
350 gp = gp + guides(shape=FALSE, color=FALSE) | |
351 | |
352 } | |
353 if (!is.null(opt$labels)) { | |
354 gp = gp + geom_text(aes_string(label=l_col), size=pts) | |
355 } | |
356 | |
357 gp | |
358 } | |
359 | |
360 | |
361 | |
362 | |
363 # -------------------- | |
364 # | |
365 # 3d scatterplot | |
366 # | |
367 # -------------------- | |
368 | |
369 | |
370 if (length(prinComp) == 3) { | |
371 | |
372 suppressPackageStartupMessages(library(scatterplot3d)) | |
373 | |
374 par(xpd=NA, omi=c(0.5, 0.5, 0.5, 1.0)) | |
375 | |
376 if (!is.na(opt$color_by)) {gp_color=my_palette[interaction(df[color_by])]} else {gp_color="black"} | |
377 if (!is.null(opt$shape_by)) {gp_shape_by=interaction(df[shape_by]); | |
378 gp_shape_by <- factor(gp_shape_by, levels=sort(intersect(levels(gp_shape_by), gp_shape_by))); gp_shape=my_shapes[gp_shape_by]} else {gp_shape_by=NULL} | |
379 | |
380 plot3d = scatterplot3d(df[prinComp], | |
381 color = gp_color, | |
382 pch = gp_shape, | |
383 xlab = sprintf('%s (%s%%)', prinComp[1], variances[prinComp_i[1]]), | |
384 ylab = sprintf('%s (%s%%)', prinComp[2], variances[prinComp_i[2]]), | |
385 zlab = sprintf('%s (%s%%)', prinComp[3], variances[prinComp_i[3]]), | |
386 cex.symbols = 1.5, | |
387 lab = c(5,4) | |
388 ) | |
389 | |
390 # !!! To be removed after the mouse paper !!! | |
391 #i=0; for(sample in interaction(df[color_by])) { | |
392 #i=i+1; plot3d$points3d(subset(df, General_category == sample, select=prinComp), type='l', col=gp_color[i])} | |
393 | |
394 if (!is.na(opt$color_by)) { | |
395 legend( | |
396 x = log(max(df[prinComp[1]])) + 3, | |
397 # x = 5, | |
398 y = 5.5, | |
399 legend = levels(interaction(df[color_by])), | |
400 fill = my_palette[seq_along(levels(interaction(df[color_by])))] | |
401 ) | |
402 } | |
403 | |
404 if (!is.na(opt$shape_by)) { | |
405 legend( | |
406 # x = -log(abs(min(df[prinComp[1]]))) - 1.5, | |
407 x = -3, | |
408 y = 6, | |
409 # y = 7.2, | |
410 legend = levels(gp_shape_by), | |
411 pch = my_shapes[seq_along(levels(gp_shape_by))] | |
412 ) | |
413 # legend(-log(abs(min(df[prinComp[1]])))+1.5,7.2,levels(gp_shape_by), | |
414 # pch=shapes[seq_along(levels(gp_shape_by))]) | |
415 } | |
416 } | |
417 | |
418 | |
419 dev.off() | |
420 q(save='no') | |
421 |