comparison MClust_all.R @ 1:f3fa21cda5f5 draft default tip

planemo upload commit 13e72e84c523bda22bda792bbebf4720d28542d5-dirty
author labis-app
date Fri, 10 Aug 2018 15:45:44 -0400
parents
children
comparison
equal deleted inserted replaced
0:ba070efb6f78 1:f3fa21cda5f5
1 ###
2 # title: ""
3 # author: "Diego M. Riano-Pachon, Rodrigo R. Dorado Goitia"
4 # date: ""
5 # output: pdf
6 ###
7
8 ###
9 # Description
10 ###
11
12 ### Installers ###
13 # install.packages("qvalue")
14 # install.packages("ggplot2")
15 # install.packages("mclust")
16 # install.packages("reshape2")
17 # install.packages('getopt')
18
19 rm(list=ls())
20
21 #Load required packages
22 suppressMessages(library(qvalue))
23 suppressMessages(library(ggplot2))
24 suppressMessages(library(mclust))
25 suppressMessages(library(reshape2))
26 suppressMessages(library(getopt))
27
28 ###
29
30 # Set Functions
31
32 # Set The directory to start the execution and read a csv file
33 # directoryName String Name of the directory.
34 # filename String Name of the csv file.
35 # Rodrigo Dorado
36 readCsvDirectory<-function(directoryName, filename, SpaceString) {
37 setwd(directoryName)
38 fileData <- read.csv(filename,
39 sep = "\t",
40 header=TRUE,
41 na.strings = 'NaN',
42 dec=SpaceString,
43 stringsAsFactors=FALSE)
44 return(fileData)
45 }
46
47 # Print the dim, columns name and the first 5 rows of a variable
48 # dataToInform Object Name of the varaible.
49 # Rodrigo Dorado
50 printInfo <- function(dataToInform) {
51 print('Length')
52 print(length(dataToInform))
53 print('---------------')
54 print('Dim')
55 print(dim(dataToInform))
56 print('***************')
57 print('Column Names')
58 print(colnames(dataToInform))
59 print('_______________')
60 print('Firsts rows')
61 print(head(dataToInform))
62 print('_______________')
63 }
64
65 ## Put the id of protein as row name, get only the data of the experiments
66 ## proteins matrix All the proteins data
67 ## column_id int The number of the column where the is the id
68 ## init_column int The number of the colmun of the initial experiment
69 ## number_exp int The number of experiments
70 ## Rodrigo Dorado
71 getDataofInfo <- function(proteins, column_id, init_column, number_exp) {
72 selected_columns <- proteins[1:nrow(proteins), c(init_column:number_exp)]
73 id_proteins_column <- proteins[1:nrow(proteins), column_id]
74 ids <- matrix(, ncol = 1, nrow = nrow(proteins))
75 j <- 1
76 for(i in id_proteins_column){
77 id_proteins_aux <- strsplit(i, ";")
78 unlist(id_proteins_aux)
79 ids[j] <- id_proteins_aux[[1]][1]
80 j <- j + 1
81 }
82 row.names(selected_columns) <- ids
83 return(selected_columns)
84 }
85
86 ## Verify if the protein contain at least one group with all the values different to NaN
87 ## proteins matrix All the proteins data
88 ## orderCondition int The division per experiment
89 ## Rodrigo Dorado
90 VarifyAComplteGroup <- function(proteins, orderCondition) {
91 protein_erase <- c()
92 col_names <- colnames(proteins)
93 ## Get every protein
94 count <- 0
95 for (protein in 1:nrow(proteins)) {
96 ok_protein <- FALSE
97 num_div <- ncol(orderCondition)
98 columnNumber <- 1
99 ## Move by a experiment
100 for (result_exp in seq(1, nrow(orderCondition))) {
101 ok_group <- TRUE
102 pos_NaN <- c()
103 pos_valid <- c()
104 ## Get the name of the experiment
105 column_name <- row.names(orderCondition)[result_exp]
106 ## Move into the divisions
107 for (div_pos in 1:num_div) {
108 if (is.na(orderCondition[result_exp, div_pos])){
109 break
110 }
111 pos <- columnNumber
112 columnNumber <- columnNumber + 1
113 ## pos <- result_exp + (div_pos - 1)
114 ## check if is NaN
115 if(is.na(proteins[protein, pos])){
116 ok_group <- FALSE
117 pos_NaN <- c(pos_NaN, pos)
118 } else {
119 pos_valid <- c(pos_valid, pos)
120 }
121 }
122 ##Check if in the group, if the three are valid
123 if (ok_group) {
124 ok_protein <- TRUE
125 }
126 switch_var <- length(pos_NaN)
127 allPos <- c(pos_valid, pos_NaN)
128 ## Execute conditions
129
130 if(length(pos_valid) == 0){
131 proteins[protein, pos_NaN] <- 0
132 }else if(length(pos_NaN) > 0){
133 proteins[protein, pos_NaN] <- getTheMedian(proteins, protein, pos_valid)
134 }
135 resultValue <- getTheMedian(proteins, protein, allPos)
136 ##if (switch_var == 0) {
137 ##resultValue <- getTheMedian(proteins, protein, allPos)
138 ##caso1 <- caso1 + 1
139 ##}
140 ##if (switch_var == 1) {
141 ##proteins[protein, pos_NaN] <- mean(as.numeric(proteins[protein, pos_valid]))
142 ##resultValue <- getTheMedian(proteins, protein, allPos)
143 ##caso2 <- caso2 + 1
144 ##}
145 ##if (switch_var == 2) {
146 ##proteins[protein, pos_NaN] <- 0
147 #### resultValue <- 0
148 ##resultValue <- proteins[protein, ]
149 ##caso3 <- caso3 + 1
150 ##}
151 ##if (switch_var == 3) {
152 ##proteins[protein, pos_NaN] <- 0
153 ##resultValue <- 0
154 ##caso4 <- caso4 + 1
155 ##}
156 ## Get the unique value
157 proteins[protein, column_name] <- resultValue
158 }
159 ## Check if the protein conatin at least one group valid
160 if (!ok_protein) {
161 protein_erase <- c(protein_erase, protein)
162 }
163 }
164 ## return the list
165 if(length(protein_erase) > 0) {
166 return(proteins[-protein_erase,])
167 }else{
168 return(proteins)
169 }
170 }
171
172 ## Get the median of a group
173 ## proteins matrix All the proteins
174 ## protein int The actual protein
175 ## positions int The number of experiment
176 ## Rodrigo Dorado
177 getTheMedian <- function(proteins, protein, positions) {
178 return(median(as.numeric(proteins[protein, positions])))
179 }
180
181 ## Merge two data frames by the id
182 ## proteins Data Frame The first data frame
183 ## p_values Data Frame The second data frame
184 mergeById <- function(proteins, p_values) {
185 proteins$auxiliar <- row.names(proteins)
186 p_values$auxiliar <- row.names(p_values)
187 proteins <- merge(proteins, p_values, by.x = 'auxiliar', by.y = 'auxiliar')
188 row.names(proteins) <- proteins[, 'auxiliar']
189 return(proteins[, -1])
190 }
191
192 ## Get the p values of the ANOVA
193 ## proteins Data Frame All the data of the experiments
194 ## Rodrigo Dorado
195
196 ##p_values <- calculateANOVA(proteins_for_ANOVA, orderCondition, ANOVA_type)
197
198 calculateANOVA <- function(proteins, orderCondition, type = "ONE-WAY") { #"ONE-WAY", "TOW_WAY"
199 all_p_values <- c()
200 for (i in 1:nrow(proteins)) {
201 groupNames <- c()
202 for (j in colnames(proteins)) {
203 groupNames <- c(groupNames, row.names(which(orderCondition == j, arr.ind = TRUE)))
204 }
205 MToANOVA <- data.frame(groupNames, as.numeric(t(proteins[i, ])))
206 colnames(MToANOVA) <- c('GROUP', 'DATA')
207 aov1 <- aov(DATA ~ GROUP, data = MToANOVA)
208 all_p_values <- c(all_p_values, summary(aov1)[[1]][["Pr(>F)"]][1])
209 }
210 p_values <- data.frame(all_p_values)
211 row.names(p_values) <- row.names(proteins)
212 return(p_values)
213 }
214
215 ## Select the data that has less than 0.05 the p_value; Replace NA by 0
216 ## selected_columns_proteins Matrix the selected columns of the data
217 ## Rodrigo Dorado
218 selectSignificanProteinsExpression <- function(selected_columns_proteins) {
219 protein_Qvalues <- qvalue(selected_columns_proteins$all_p_values, lambda=0)
220 selected_columns_proteins$QValue <- protein_Qvalues$qvalues
221 selectedSignificantExpr <- selected_columns_proteins[which(selected_columns_proteins$QValue < 0.05), c(1:10)]
222 all_row_names <- row.names(selectedSignificantExpr)
223 selectedSignificantExpr <- as.data.frame(sapply(selectedSignificantExpr, as.numeric))
224 row.names(selectedSignificantExpr) <- all_row_names
225 ##selectedSignificantExpr[is.na(selectedSignificantExpr)] <- 0
226 return(selectedSignificantExpr)
227 }
228
229 ## Get Z Values of the proteins Expression
230 ## selectedSignificantExpr Matrix The values of the selected proteins
231 ## Rodrigo Dorado
232 getZValueSgnificantProteinExpression <- function(selectedSignificantExpr) {
233 return(as.data.frame(t(scale(t(selectedSignificantExpr)))))
234 }
235
236 ## Execute cluster function, save the result in a csv file. Note: add the condition of max clusters
237 ## ZValues Matrix The z values to cluster
238 ## numOfClusters int The max number of clusters
239 ## filename String The name of the csv fil
240 ## Rodrigo Dorado
241 executeMClust <- function(ZValues, directoryCSV, filename, numOfClusters = 60) {
242 print("Clustering")
243 print(numOfClusters)
244 clusterResult <- Mclust(ZValues, G = 2:numOfClusters)
245 print(summary(clusterResult))
246 Num_clusters <- max(clusterResult$classification, na.rm=TRUE)
247 if (Num_clusters >= numOfClusters) {
248 newNumOfClusters <- numOfClusters + 30
249 executeMClust(ZValues, filename, newNumOfClusters)
250 } else {
251 clusters <- data.frame(ProteinID = names(clusterResult$classification), Cluster = clusterResult$classification, row.names = NULL)
252 table(clusters$Cluster)
253 setwd(directoryCSV)
254 #write.csv(clusters, filename, row.names = FALSE, quote = FALSE)
255 write.table(clusters, file = filename, row.names = FALSE, quote = FALSE, sep = "\t")
256 return(clusters)
257 }
258 }
259
260 ## Prepare data structure
261 ## ZValues Matrix the Z Values of the proteins selected
262 ## clusters Matrix The clusters of the proteins
263 ## Rodrigo Dorado
264 getZvaluesMelt <- function(ZValues, clusters, orderCondition, columnsNameTime) {
265 timesCond <- row.names(orderCondition)
266 ZValues_melt <- ZValues
267 ZValues_melt$ProteinID <- rownames(ZValues_melt)
268 ZValues_melt <- merge(ZValues_melt, clusters, by.x = 'ProteinID', by.y = 'ProteinID')
269 ZValues_melt <- melt(ZValues_melt, id.vars = c('ProteinID','Cluster'))
270 colnames(ZValues_melt) <- c('ProteinID', 'Cluster', 'Time', 'RelativeExpression')
271 ZValues_melt$Cluster <- as.factor(ZValues_melt$Cluster)
272 for(cond in 1:nrow(orderCondition)) {
273 ZValues_melt$Time2[ZValues_melt$Time == timesCond[cond]] <- as.numeric(columnsNameTime[timesCond[cond], 2])
274 }
275 return(ZValues_melt)
276 }
277
278 ## Test the cluster function with diferent sizes
279 ## zvalues data Frame The Z values of the data to get clustered
280 ## number_times array The sizes of the test
281 ## Rodrigo Dorado
282 testCluster <- function (zvalues, number_times) {
283 for (i in number_times) {
284 print("***********")
285 print(i)
286 cluster_result <- Mclust(zvalues, G = 2:i)
287 print(summary(cluster_result))
288 print("-----------")
289 }
290 }
291
292
293 ## Set the color of the background of the image
294 ## Rodrigo dorado
295 setBackGroundColors <- function() {
296 rects <- as.data.frame(matrix(ncol = 3,nrow=2))
297 colnames(rects) <- c('xstart','xend','col')
298 rects[1,] <- c(-49,0,'gray0')
299 rects[2,] <- c(0,73,'white')
300 rects$xend <- as.numeric(rects$xend)
301 rects$xstart <- as.numeric(rects$xstart)
302 return(rects)
303 }
304
305 ## Create new folder.
306 ## mainDir String The directory of the new folder.
307 ## folder String The name of the new folder.
308 createFolder <- function(mainDir, folder) {
309 if (file.exists(folder)){
310 setwd(file.path(mainDir, folder))
311 } else {
312 dir.create(file.path(mainDir, folder))
313 setwd(file.path(mainDir, folder))
314 }
315 }
316
317 # Multiple plot function
318 #
319 # ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
320 # - cols: Number of columns in layout
321 # - layout: A matrix specifying the layout. If present, 'cols' is ignored.
322 #
323 # If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
324 # then plot 1 will go in the upper left, 2 will go in the upper right, and
325 # 3 will go all the way across the bottom.
326 # http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2)/
327 multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
328 library(grid)
329
330 # Make a list from the ... arguments and plotlist
331 plots <- c(list(...), plotlist)
332
333 numPlots = length(plots)
334
335 # If layout is NULL, then use 'cols' to determine layout
336 if (is.null(layout)) {
337 # Make the panel
338 # ncol: Number of columns of plots
339 # nrow: Number of rows needed, calculated from # of cols
340 layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
341 ncol = cols, nrow = ceiling(numPlots/cols))
342 }
343
344 if (numPlots==1) {
345 print(plots[[1]])
346
347 } else {
348 # Set up the page
349 grid.newpage()
350 pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
351
352 # Make each plot, in the correct location
353 for (i in 1:numPlots) {
354 # Get the i,j matrix positions of the regions that contain this subplot
355 matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
356
357 print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
358 layout.pos.col = matchidx$col))
359 }
360 }
361 }
362
363 ## Save all the plots in a pdf file
364 ## name String The name of the file
365 ## data Array All the plots
366 ## Rodrigo Dorado
367 savePlotPDF <- function(name, data) {
368 pdf(name)
369 invisible(lapply(data, print))
370 dev.off()
371 }
372
373 ## Order the conditions per group
374 ## conditions Data Frame All the conditions
375 ## Rodrigo Dorado
376 orderConditionData <- function(conditions) {
377 experimentNames <- unique(conditions[,2])
378 maxSizeId <- tail(names(sort(table(conditions[,2]))), 1)
379 maxSize <- nrow(conditions[conditions[,2] == maxSizeId,])
380 result <- matrix(nrow = length(experimentNames), ncol = maxSize)
381 row.names(result) <- experimentNames
382 colnames(result) <- c(1:maxSize)
383 for(i in 1:length(experimentNames)) {
384 for (j in 1:maxSize) {
385 result[experimentNames[i],j] <- conditions[conditions[,2] == experimentNames[i],][,1][j]
386 }
387 }
388 return(result)
389 }
390
391 ## Get the times or names of the conditions.
392 ## conditions data frame All the text of the control file.
393 ## separaLine Int The line where the flag was founded.
394 ## Rodrigo Dorado
395 getNamesOfConditions <- function(conditions, separateLine) {
396 columnsNameTime <- conditions[(as.numeric(separateLine) + 1):nrow(conditions), ]
397 colnames(columnsNameTime) <- columnsNameTime[1,]
398 columnsNameTime <- columnsNameTime[-1,]
399 row.names(columnsNameTime) <- columnsNameTime[,1]
400 ##columnsNameTime <- columnsNameTime[,-1]
401 return(columnsNameTime)
402 }
403
404 ## Get the directory of a file
405 ## file String File name and directory
406 ## Rodrigo Dorado
407 getDirectoryOfFile <- function(file) {
408 dir <- file
409 allDir <- strsplit(dir, "/")[[1]]
410 allEnters<-length(allDir) - 1
411 directory<-allDir[1]
412 for(i in 2:allEnters) {
413 directory<-paste(directory, allDir[i], sep = "/")
414 }
415 return(list("directory" = directory, "file" = allDir[length(allDir)]) )
416 }
417
418 spec = matrix(c(
419 'filename', 'f', 1, "character",
420 'controlFile', 'c', 1, "character",
421 #'projectName', 'p', 1, "character",
422 'SpaceString', 's', 1, "character",
423 'idColumn', 'i', 1, "integer",
424 'firstExperiment', 'e', 1, "integer",
425 'csvFile', 'v', 1, "character",
426 'pdfFile', 'd', 1, "character"
427 ), byrow=TRUE, ncol=4)
428
429 opt = getopt(spec)
430
431 filename <- opt$filename
432 controlFileName <- opt$controlFile
433 #project_name <- opt$projectName
434 SpaceString <- opt$SpaceString
435 id_column <- opt$idColumn
436 first_experiment <- opt$firstExperiment
437 csvFile <- opt$csvFile
438 pdfFile <- opt$pdfFile
439
440
441 directoryF <- getDirectoryOfFile(filename)
442 directoryC <- getDirectoryOfFile(controlFileName)
443 directoryCSV <- getDirectoryOfFile(csvFile)
444 directoryPDF <- getDirectoryOfFile(pdfFile)
445
446
447 #directory <- "/home/rodrigodorado/MClustDirectory/" ##Directory of the file
448 #filename <- "EXP11_paraR_log.txt" ##File name
449 #project_name <- "june_20_2018" ##Project name
450 #controlFileName <- "EXP11_paraR_log_properties" ##Control File
451 #id_column <- 31 ##Column number of the id of the proteins
452 #first_experiment <- 0 ##Colmn number where the Experiments start
453 #SpaceString <- "."
454 ANOVA_type <- "ONE-WAY"
455 #ANOVA_type <- "TOW_WAY"
456
457 All_proteins <- readCsvDirectory(directoryF$directory, directoryF$file, SpaceString)
458 conditions <- readCsvDirectory(directoryC$directory, directoryC$file, SpaceString)
459 separateLine <- row.names(conditions[conditions[,1] == '-- SEPARATE --',])
460 columnsNameTime <- getNamesOfConditions(conditions, separateLine)
461 conditions <- conditions[1:(as.numeric(separateLine) - 1), ]
462 printInfo(conditions)
463 conditions[,1] <- gsub(" ", SpaceString, conditions[,1])
464 printInfo(conditions)
465 Column_division <- nrow(conditions)
466 orderCondition <- orderConditionData(conditions)
467 printInfo(orderCondition)
468 printInfo(All_proteins)
469 #createFolder(directory, project_name)
470 proteins_complete <- getDataofInfo(All_proteins, id_column, first_experiment, Column_division)
471 printInfo(proteins_complete)
472 proteins_New_column <- VarifyAComplteGroup(proteins_complete, orderCondition)
473 printInfo(proteins_New_column)
474 proteins_for_ANOVA <- proteins_New_column[,1:Column_division]
475 proteins <- proteins_New_column[,(Column_division + 1):ncol(proteins_New_column)]
476 printInfo(proteins_for_ANOVA)
477 printInfo(proteins)
478 #write.csv(proteins_for_ANOVA, "Proteins For Anova.csv",row.names = TRUE, quote = FALSE)
479 #write.csv(proteins, "Proteins With Conditions.csv",row.names = TRUE, quote = FALSE)
480 p_values <- calculateANOVA(proteins_for_ANOVA, orderCondition, ANOVA_type)
481 printInfo(p_values)
482 proteins <- mergeById(proteins, p_values)
483 printInfo(proteins)
484 selectedSignificantExpr <- selectSignificanProteinsExpression(proteins)
485 printInfo(selectedSignificantExpr)
486 Sig_Pro_Exp_ZValues <- getZValueSgnificantProteinExpression(selectedSignificantExpr)
487 printInfo(Sig_Pro_Exp_ZValues)
488
489 clusters <- executeMClust(Sig_Pro_Exp_ZValues, directoryCSV$directory, directoryCSV$file)#'clusters.csv'
490 printInfo(clusters)
491 Sig_Pro_Exp_ZValues_melt <- getZvaluesMelt(Sig_Pro_Exp_ZValues, clusters, orderCondition, columnsNameTime)
492
493 rects <- setBackGroundColors()
494
495 ##plot the result. note: make function(s)
496 plots_withBackground <- list()
497 plots_withOutBackground <- list()
498 for (clusterNumber in 1:max(clusters$Cluster)){
499 averageProfile <- as.data.frame(matrix(ncol = nrow(orderCondition),nrow=1))
500 timesCond <- row.names(orderCondition)
501 colnames(averageProfile) <- timesCond
502 for (cond in 1:nrow(orderCondition)) {
503 averageProfile[timesCond[cond]] <- mean(selectedSignificantExpr[row.names(selectedSignificantExpr) %in% clusters[which(clusters$Cluster == clusterNumber),"ProteinID"], timesCond[cond]])
504 }
505
506 zval_DEGenes_cluster <- as.data.frame(t(scale(t(averageProfile))))
507 zval_DEGenes_cluster$ClusterNumber <- clusterNumber
508 zval_DEGenes_cluster_melt <- melt(zval_DEGenes_cluster, id.vars="ClusterNumber")
509 zval_DEGenes_cluster_melt$ClusterNumber <- NULL
510 colnames(zval_DEGenes_cluster_melt) <- c("Time","RelativeExpression")
511 zval_DEGenes_cluster_melt$ProteinID <- "Average"
512
513 for(cond in 1:nrow(orderCondition)) {
514 zval_DEGenes_cluster_melt$Time2[zval_DEGenes_cluster_melt$Time == timesCond[cond]] <- as.numeric(columnsNameTime[timesCond[cond], 2])
515 }
516
517 numberGenes = length(unique(clusters[which(clusters$Cluster == clusterNumber),"ProteinID"]))
518
519 outFilePlot1 = paste("cluster1",clusterNumber,"graphWithoutBackground","pdf",sep=".")
520 outFilePlot2 = paste("cluster1",clusterNumber,"graphWithBackground","pdf",sep=".")
521
522 p <- ggplot(Sig_Pro_Exp_ZValues_melt[ which(Sig_Pro_Exp_ZValues_melt$Cluster == clusterNumber),], aes(x = Time2, y = RelativeExpression, group = ProteinID)) +
523 geom_line(col = "lightgray") +
524 geom_line(data = zval_DEGenes_cluster_melt, aes(x = Time2, y = RelativeExpression)) +
525 theme_bw() +
526 xlab("Time point") +
527 ylab("Relative expression") +
528 ggtitle(paste("Cluster 1", clusterNumber, sep = ".")) +
529 annotate("text", x = 3, y = 1, label = paste(numberGenes,"proteins", sep = " "))
530 ##ggsave(filename = outFilePlot1, plot = p)
531 plots_withOutBackground[[clusterNumber]] <- p
532 }
533 setwd(directoryPDF$directory)
534 savePlotPDF(directoryPDF$file, plots_withOutBackground)