comparison plot_pca.R @ 0:610e86c430a9 draft default tip

planemo upload commit 0b661bcf940c03e11becd42b3321df9573b591b2
author vmarcon
date Mon, 23 Oct 2017 09:34:56 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:610e86c430a9
1 #!/usr/bin/env Rscript
2
3 # DEFAULT OPTIONS
4
5 opt = list()
6 opt$log10 = FALSE
7 opt$pseudocount = 1e-04
8 opt$row_as_variables = FALSE
9
10 suppressPackageStartupMessages(library("optparse"))
11
12 options(stringsAsFactors=F)
13
14 ##################
15 # OPTION PARSING
16 ##################
17
18 option_list <- list(
19 make_option(c("-i", "--input_matrix"), help="the matrix you want to analyze. Can be stdin"),
20 make_option(c("-l", "--log10"), action="store_true", default=FALSE, help="apply the log [default=FALSE]"),
21 make_option(c("-p", "--pseudocount"), type="double", help="specify a pseudocount for the log [default=%default]", default=1e-04),
22 make_option(c("-m", "--metadata"), help="A list of tsv files with metadata on matrix experiment.\n\t\tThey must be in the format 'file1.tsv,file2.tsv' and contain a key column named 'labExpId'. Can be omitted"),
23
24 make_option(c("--merge_mdata_on"), default="labExpId",
25 help="[default=%default]"),
26
27 #make_option(c("-o", "--output"), help="additional info you want to put in the output file name", default="out"),
28 make_option(c("-c", "--color_by"), help="choose the fields in the metadata you want to color by", type='character'),
29
30 make_option(c("--sort_color"), type='character',
31 help="A field for sorting colors. Can be omitted [default=%default]"),
32
33 make_option(c("-s", "--shape_by"), default=NULL, type="character", help="choose the fields in the metadata you want to shape by"),
34
35 make_option(c("--no_legend"), action="store_true", default=FALSE,
36 help="Do not show the legend [default=%default]"),
37
38 make_option(c("-r", "--row_as_variables"), action="store_true", help="select this if you want rows as variables [default=%default]", default=FALSE),
39 make_option(c("-C", "--princomp"), help="choose the principal components you want to plot. With 3 PC it gives a 3d plot [default='PC1,PC2']", default="PC1,PC2"),
40
41 make_option(c("--print_scores"), action="store_true", default=FALSE,
42 help="Output the resuling PCs as a separate file with the extension PCs.tsv [default=%default]"),
43
44 make_option(c("--print_loadings"), action="store_true", default=FALSE,
45 help="Output the resulting loadings as a separate file with the extension loadings.tsv [default=%default]"),
46
47 make_option(c("--print_lambdas"), action="store_true", default=FALSE,
48 help="Output the resulting lambdas (stdev) as a separate file with the extension lambdas.tsv [default=%default]"),
49
50 make_option(c("--biplot"), default=FALSE, action="store_true",
51 help="If active, the factor of the color is used as grouping factor.
52 Centroids are computed and the first <top> loadings are plotted wrt to the two specified components [default=%default]"),
53
54 make_option(c("--palette"), default="/users/rg/abreschi/R/palettes/cbbPalette1.15.txt",
55 help="File with the color palette [default=%default]"),
56
57 make_option(c("--border"), default=FALSE, action="store_true",
58 help="Black border to dots [default=%default]"),
59
60 make_option(c("--shapes"),
61 help="File with the shapes [default=%default]"),
62
63 make_option(c("-L", "--labels"), default=NULL, type="character",
64 help="The metadata field with the labels [default=%default]"),
65
66 make_option(c("-B", "--base_size"), default=16, type='numeric',
67 help="Base font size [default=%default]"),
68
69 make_option(c("-H", "--height"), default=7,
70 help="Height of the plot in inches [default=%default]"),
71
72 make_option(c("-W", "--width"), default=7,
73 help="Width of the plot in inches [default=%default]"),
74
75 make_option(c("-o", "--output"), default="pca.out",
76 help="output file name [default=%default]"),
77
78 make_option(c("-v", "--verbose"), action='store_true', default=FALSE,
79 help="verbose output [default=%default]")
80 )
81
82 parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
83 arguments <- parse_args(parser, positional_arguments = TRUE)
84 opt <- arguments$options
85
86 if (opt$verbose) {print(opt)}
87 ##------------
88 ## LIBRARIES
89 ##------------
90 suppressPackageStartupMessages(library("reshape"))
91 suppressPackageStartupMessages(library("ggplot2"))
92 suppressPackageStartupMessages(library("grid"))
93
94
95 ###############
96 # BEGIN
97 ##############
98
99 # read input tables
100 inF = opt$input_matrix; if (opt$input_matrix == "stdin") {inF = file("stdin")}
101 m = read.table(inF, h=T, sep="\t")
102 if (opt$verbose) {
103 cat("Sample of input matrix:\n")
104 print(head(m[,1:10]))
105 }
106
107
108 # Read the color palette
109 my_palette = read.table(opt$palette, h=F, comment.char="%", sep="\t")$V1
110
111 # Read the color ordering
112 if (is.null(opt$sort_color)) {
113 sort_color=NULL
114 }else{
115 sort_color = strsplit(opt$sort_color, ",")[[1]]
116 }
117
118 # Read the shapes
119 if (!is.null(opt$shapes)) {
120 my_shapes = read.table(opt$shapes, h=F, comment.char="%")$V1
121 }
122
123 # remove potential gene id columns
124 char_cols <- which(sapply(m, class) == 'character')
125 if (length(char_cols) == 0) {genes = rownames(m)}
126 if (length(char_cols) != 0) {genes = m[,char_cols]; m = m[,-(char_cols)]}
127
128 if (opt$verbose) {sprintf("WARNING: column %s is character, so it is removed from the analysis", char_cols)}
129
130 # apply the log if required
131 if (opt$log10) {m = log10(replace(m, is.na(m), 0) + opt$pseudocount)}
132
133 # apply pca
134 #if (opt$row_as_variable) {
135 #m_pca = prcomp(na.omit(m), center=FALSE, scale.=FALSE)} else{
136 #m_pca = prcomp(t(na.omit(m)), center=FALSE, scale.=FALSE)}
137 #Modified by Gaelle Lefort
138 # apply pca
139 if (opt$row_as_variable) {
140 m_pca = prcomp(t(na.omit(m)), center=FALSE, scale.=FALSE)
141 } else {
142 m_pca = prcomp(na.omit(m), center=FALSE, scale.=FALSE)
143 }
144
145
146 # Scale the scores for biplot
147 #scaledScores = sweep(m_pca$x, 2, m_pca$sdev / sqrt(nrow(m_pca$x)), "/")
148 scaledScores = m_pca$x
149
150 if (opt$verbose) {print(dim(na.omit(m)))}
151
152 # HANDLING METADATA
153
154 # add metadata to pca results, they should be input in the command line in the future
155 if (is.null(opt$color_by)) {color_by=NULL
156 }else{color_by = color_by = strsplit(opt$color_by, ",")[[1]]}
157 if (is.null(opt$shape_by)) {shape_by=NULL
158 }else{shape_by = strsplit(opt$shape_by, ",")[[1]]}
159
160 # read metadata, one or more table to be merged on labExpId
161 if (!is.null(opt$metadata)){
162 mdata = read.table(opt$metadata, h=T, sep="\t", row.names=NULL, comment.char="", quote="");
163 if (opt$merge_mdata_on %in% colnames(mdata)) {
164 mdata[,opt$merge_mdata_on] <- gsub("[,-]", ".", mdata[,opt$merge_mdata_on])
165 }
166
167 if (opt$verbose) {cat('append metadata...')}
168
169 df = merge(as.data.frame(scaledScores),
170 unique(mdata[c(color_by, shape_by, opt$merge_mdata_on, opt$labels)]), by.x='row.names', by.y=opt$merge_mdata_on, all.x=T)
171 if (opt$verbose) {cat("DONE\n")}
172 }else{
173 df = as.data.frame(scaledScores)
174 }
175
176
177 if (opt$verbose) {print(head(df))}
178
179 #########
180 # OUTPUT
181 #########
182
183 output_name = opt$output
184
185 # Print text outputs if required
186
187 # -- principal components --
188 if (opt$print_scores) {
189 write.table(m_pca$x, sprintf("%s.PCs.tsv", output_name), quote=F, sep="\t")
190 }
191
192 # -- loadings --
193 if (opt$print_loadings) {
194 write.table(m_pca$rotation, sprintf("%s.loadings.tsv", output_name), quote=F, sep="\t")
195 }
196
197 # -- lambdas --
198 if (opt$print_lambdas) {
199 perc = round(100*m_pca$sdev/sum(m_pca$sdev), 2)
200 variances = round(m_pca$sdev^2/sum(m_pca$sdev^2)*100, 2)
201 out_df = data.frame(lambda=m_pca$sdev, perc=perc, var_perc=variances)
202 write.table(out_df, sprintf("%s.lambdas.tsv", output_name), quote=F, sep="\t")
203 }
204
205 # Read the required components
206 prinComp = strsplit(opt$princomp, ",")[[1]]
207 prinComp_i = as.numeric(gsub("PC", "", prinComp))
208
209 # Get a vector with all the variance percentages
210 variances = round(m_pca$sdev^2/sum(m_pca$sdev^2)*100, 2)
211
212 if (opt$biplot) {
213
214 aggrVar = opt$color_by
215
216 # === Centroids ===
217
218 centroids = aggregate (
219 formula(sprintf(".~%s", aggrVar)),
220 df[,c(colnames(df)[grep("^PC", colnames(df))], aggrVar)],
221 mean
222 )
223 centroidsM = centroids[,-1]
224
225 # === Loadings ===
226
227 vecNorm = function(x) {sqrt(sum(x**2))}
228
229 scaledLoadings = sweep(m_pca$rotation, 2, m_pca$sdev * nrow(m_pca$x), "*")
230
231 centroidsNorm = apply(centroidsM[,prinComp], 1, vecNorm) # DIM: number of levels x 1
232 loadingsNorm = apply(scaledLoadings[,prinComp], 1, vecNorm) # DIM: number of variables x 1
233
234 cosine = ( scaledLoadings[,prinComp] %*% t(centroidsM[,prinComp]) ) / (loadingsNorm %*% t(centroidsNorm))
235 cosine = setNames(data.frame(cosine), centroids[,1])
236
237 closest = setNames(melt(apply(1-cosine, 2, rank)), c("variable", aggrVar, "rank"))
238 write.table( closest, file=sprintf("%s.cosine.tsv", opt$output), quote=F, row.names=F, sep="\t");
239
240 closest_df = data.frame(merge(closest, scaledLoadings, by.x="variable", by.y="row.names"))
241
242
243 }
244
245
246
247 #############
248 # PLOT
249 #############
250
251 # plot parameters
252 pts = 5
253
254 l_col = opt$labels
255 base_size = opt$base_size
256
257 theme_set(theme_bw(base_size = base_size))
258 theme_update(legend.text=element_text(size=0.9*base_size),
259 legend.key.size=unit(0.9*base_size, "points"),
260 legend.key = element_blank()
261 )
262
263 top = 30
264
265
266 # Open device for plotting
267 pdf(sprintf("%s.pdf", output_name), w=opt$width, h=opt$height)
268
269 if (length(prinComp) == 2){
270
271 geom_params = list()
272 geom_params$size = pts
273 # geom_params$alpha = opt$alpha
274
275 mapping = list()
276 mapping <- modifyList(mapping, aes_string(x=prinComp[1], y=prinComp[2]))
277
278 if (!is.null(opt$color_by)) {
279 gp_color_by=interaction(df[color_by])
280 if (!is.null(opt$sort_color)) {
281 gp_color_by = factor(gp_color_by, levels=sort_color)
282 }
283 mapping = modifyList(mapping, aes_string(color=gp_color_by, order=gp_color_by))
284 } else {
285 gp_color_by=NULL
286 }
287
288 if (!is.null(opt$shape_by)) {
289 gp_shape_by=interaction(df[shape_by])
290 if (!is.null(opt$sort_shape)) {
291 gp_shape_by = factor(gp_shape_by, levels=sort_shape)
292 }
293 mapping = modifyList(mapping, aes_string(shape=gp_shape_by, order=gp_shape_by))
294 } else {
295 gp_shape_by=NULL
296 }
297
298 # if (!is.na(opt$shape_by)) {gp_shape_by=interaction(df[shape_by]);
299 # gp_shape_by <- factor(gp_shape_by, levels=sort(levels(gp_shape_by)))
300 # mapping = modifyList(mapping, aes_string(shape=S_col))
301
302 class(mapping) <- "uneval"
303
304 pointLayer <- layer(
305 geom = "point",
306 # geom_params = geom_params,
307 params = geom_params,
308 mapping = mapping,
309 stat = "identity",
310 position = "identity"
311 )
312
313
314
315
316 # plotting...
317 gp = ggplot(df, aes_string(x=prinComp[1],y=prinComp[2]));
318
319 if (opt$biplot) {
320 gp = gp + geom_point(data=centroids, aes_string(x=prinComp[1], y=prinComp[2], color=opt$color_by), shape=8, size=7)
321 gp = gp + geom_segment(
322 data=subset(closest_df, rank <= top),
323 aes_string(x=0, y=0, xend=prinComp[1], yend=prinComp[2], color=opt$color_by)
324 )
325 }
326
327
328 if (opt$border) {
329 if (!is.null(opt$shape_by)) {
330 gp = gp + geom_point(aes(shape=gp_shape_by), col='black', size=pts+1.0);
331 } else {
332 gp = gp + geom_point(col="black", size=pts+1.0)
333 }
334 }
335
336 gp = gp + pointLayer
337
338 # gp = gp + geom_point(aes(color=gp_color_by))
339 # gp = gp + geom_point(aes(col=gp_color_by, shape=gp_shape_by), size=pts);
340 #
341 gp = gp + labs(title="");
342 gp = gp + labs(x=sprintf('%s (%s%%)', prinComp[1], variances[prinComp_i[1]]));
343 gp = gp + labs(y=sprintf('%s (%s%%)', prinComp[2], variances[prinComp_i[2]]));
344
345 gp = gp + scale_color_manual(name=opt$color_by, values=my_palette)
346 if (!is.null(opt$shapes)) {
347 gp = gp + scale_shape_manual(name=opt$shape_by, values=my_shapes);
348 }
349 if (opt$no_legend) {
350 gp = gp + guides(shape=FALSE, color=FALSE)
351
352 }
353 if (!is.null(opt$labels)) {
354 gp = gp + geom_text(aes_string(label=l_col), size=pts)
355 }
356
357 gp
358 }
359
360
361
362
363 # --------------------
364 #
365 # 3d scatterplot
366 #
367 # --------------------
368
369
370 if (length(prinComp) == 3) {
371
372 suppressPackageStartupMessages(library(scatterplot3d))
373
374 par(xpd=NA, omi=c(0.5, 0.5, 0.5, 1.0))
375
376 if (!is.na(opt$color_by)) {gp_color=my_palette[interaction(df[color_by])]} else {gp_color="black"}
377 if (!is.null(opt$shape_by)) {gp_shape_by=interaction(df[shape_by]);
378 gp_shape_by <- factor(gp_shape_by, levels=sort(intersect(levels(gp_shape_by), gp_shape_by))); gp_shape=my_shapes[gp_shape_by]} else {gp_shape_by=NULL}
379
380 plot3d = scatterplot3d(df[prinComp],
381 color = gp_color,
382 pch = gp_shape,
383 xlab = sprintf('%s (%s%%)', prinComp[1], variances[prinComp_i[1]]),
384 ylab = sprintf('%s (%s%%)', prinComp[2], variances[prinComp_i[2]]),
385 zlab = sprintf('%s (%s%%)', prinComp[3], variances[prinComp_i[3]]),
386 cex.symbols = 1.5,
387 lab = c(5,4)
388 )
389
390 # !!! To be removed after the mouse paper !!!
391 #i=0; for(sample in interaction(df[color_by])) {
392 #i=i+1; plot3d$points3d(subset(df, General_category == sample, select=prinComp), type='l', col=gp_color[i])}
393
394 if (!is.na(opt$color_by)) {
395 legend(
396 x = log(max(df[prinComp[1]])) + 3,
397 # x = 5,
398 y = 5.5,
399 legend = levels(interaction(df[color_by])),
400 fill = my_palette[seq_along(levels(interaction(df[color_by])))]
401 )
402 }
403
404 if (!is.na(opt$shape_by)) {
405 legend(
406 # x = -log(abs(min(df[prinComp[1]]))) - 1.5,
407 x = -3,
408 y = 6,
409 # y = 7.2,
410 legend = levels(gp_shape_by),
411 pch = my_shapes[seq_along(levels(gp_shape_by))]
412 )
413 # legend(-log(abs(min(df[prinComp[1]])))+1.5,7.2,levels(gp_shape_by),
414 # pch=shapes[seq_along(levels(gp_shape_by))])
415 }
416 }
417
418
419 dev.off()
420 q(save='no')
421