Mercurial > repos > labis-app > galaxy_proteomics

diff MClust_all.R @ 1:f3fa21cda5f5 draft default tip
planemo upload commit 13e72e84c523bda22bda792bbebf4720d28542d5-dirty
author: labis-app
date: Fri, 10 Aug 2018 15:45:44 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MClust_all.R	Fri Aug 10 15:45:44 2018 -0400
@@ -0,0 +1,534 @@
+###
+# title: ""
+# author: "Diego M. Riano-Pachon, Rodrigo R. Dorado Goitia"
+# date: ""
+# output: pdf
+###
+
+###
+# Description
+###
+
+### Installers ###
+# install.packages("qvalue")
+# install.packages("ggplot2")
+# install.packages("mclust")
+# install.packages("reshape2")
+# install.packages('getopt')
+
+rm(list=ls())
+
+#Load required packages
+suppressMessages(library(qvalue))
+suppressMessages(library(ggplot2))
+suppressMessages(library(mclust))
+suppressMessages(library(reshape2))
+suppressMessages(library(getopt))
+
+###
+
+# Set Functions
+
+# Set The directory to start the execution and read a csv file
+# directoryName String Name of the directory.
+# filename String Name of the csv file.
+# Rodrigo Dorado
+readCsvDirectory<-function(directoryName, filename, SpaceString) {
+  setwd(directoryName)
+  fileData <- read.csv(filename,
+                       sep = "\t", 
+                       header=TRUE, 
+                       na.strings = 'NaN',
+                       dec=SpaceString,
+                       stringsAsFactors=FALSE)
+  return(fileData)
+}
+
+# Print the dim, columns name and the first 5 rows of a variable
+# dataToInform  Object Name of the varaible.
+# Rodrigo Dorado
+printInfo <- function(dataToInform) {
+  print('Length')
+  print(length(dataToInform))
+  print('---------------')
+  print('Dim')
+  print(dim(dataToInform))
+  print('***************')
+  print('Column Names')
+  print(colnames(dataToInform))
+  print('_______________')
+  print('Firsts rows')
+  print(head(dataToInform))
+  print('_______________')
+}
+
+## Put the id of protein as row name, get only the data of the experiments
+## proteins matrix All the proteins data
+## column_id int The number of the column where the is the id
+## init_column int The number of the colmun of the initial experiment
+## number_exp int The number of experiments
+## Rodrigo Dorado 
+getDataofInfo <- function(proteins, column_id, init_column, number_exp) {
+  selected_columns <- proteins[1:nrow(proteins), c(init_column:number_exp)]
+  id_proteins_column <- proteins[1:nrow(proteins), column_id]
+  ids <- matrix(, ncol = 1, nrow = nrow(proteins))
+  j <- 1
+  for(i in id_proteins_column){
+    id_proteins_aux <- strsplit(i, ";")
+    unlist(id_proteins_aux)
+    ids[j] <- id_proteins_aux[[1]][1]
+    j <- j + 1
+  }
+  row.names(selected_columns) <- ids
+  return(selected_columns)
+}
+
+## Verify if the protein contain at least one group with all the values different to NaN
+## proteins matrix All the proteins data
+## orderCondition int The division per experiment
+## Rodrigo Dorado
+VarifyAComplteGroup <- function(proteins, orderCondition) {
+  protein_erase <- c()
+  col_names <- colnames(proteins)
+  ## Get every protein
+  count <- 0
+  for (protein in 1:nrow(proteins)) {
+    ok_protein <- FALSE
+    num_div <- ncol(orderCondition)
+    columnNumber <- 1
+    ## Move by a experiment
+    for (result_exp in seq(1, nrow(orderCondition))) {
+      ok_group <- TRUE
+      pos_NaN <- c()
+      pos_valid <- c()
+      ## Get the name of the experiment
+      column_name <- row.names(orderCondition)[result_exp]
+      ## Move into the divisions
+      for (div_pos in 1:num_div) {
+        if (is.na(orderCondition[result_exp, div_pos])){
+          break
+        }
+        pos <- columnNumber
+        columnNumber <- columnNumber + 1
+        ## pos <- result_exp + (div_pos - 1)
+        ## check if is NaN
+        if(is.na(proteins[protein, pos])){
+          ok_group <- FALSE
+          pos_NaN <- c(pos_NaN, pos)
+        } else {
+          pos_valid <- c(pos_valid, pos)
+        }
+      }
+      ##Check if in the group, if the three are valid
+      if (ok_group) {
+        ok_protein <- TRUE
+      }
+      switch_var <- length(pos_NaN)
+      allPos <- c(pos_valid, pos_NaN)
+      ## Execute conditions
+      
+      if(length(pos_valid) == 0){
+        proteins[protein, pos_NaN] <- 0
+      }else if(length(pos_NaN) > 0){
+        proteins[protein, pos_NaN] <- getTheMedian(proteins, protein, pos_valid)
+      }
+      resultValue <- getTheMedian(proteins, protein, allPos)
+      ##if (switch_var == 0) {
+      ##resultValue <- getTheMedian(proteins, protein, allPos)
+      ##caso1 <- caso1 + 1
+      ##}
+      ##if (switch_var == 1) {
+      ##proteins[protein, pos_NaN] <- mean(as.numeric(proteins[protein, pos_valid]))
+      ##resultValue <- getTheMedian(proteins, protein, allPos)
+      ##caso2 <- caso2 + 1
+      ##}
+      ##if (switch_var == 2) {
+      ##proteins[protein, pos_NaN] <- 0
+      #### resultValue <- 0
+      ##resultValue <- proteins[protein,   ]
+      ##caso3 <- caso3 + 1
+      ##}
+      ##if (switch_var == 3) {
+      ##proteins[protein, pos_NaN] <- 0
+      ##resultValue <- 0
+      ##caso4 <- caso4 + 1
+      ##}
+      ## Get the unique value
+      proteins[protein, column_name] <- resultValue 
+    }
+    ## Check if the protein conatin at least one group valid
+    if (!ok_protein) {
+      protein_erase <- c(protein_erase, protein)
+    }
+  }
+  ## return the list
+  if(length(protein_erase) > 0) {
+    return(proteins[-protein_erase,])
+  }else{
+    return(proteins)
+  }
+}
+
+## Get the median of a group
+## proteins matrix All the proteins
+## protein int The actual protein
+## positions int The number of experiment
+## Rodrigo Dorado
+getTheMedian <- function(proteins, protein, positions) {
+  return(median(as.numeric(proteins[protein, positions])))
+}
+
+## Merge two data frames by the id
+## proteins Data Frame The first data frame
+## p_values Data Frame The second data frame
+mergeById <- function(proteins, p_values) {
+  proteins$auxiliar <- row.names(proteins)
+  p_values$auxiliar <- row.names(p_values)
+  proteins <- merge(proteins, p_values, by.x = 'auxiliar', by.y = 'auxiliar')
+  row.names(proteins) <- proteins[, 'auxiliar']
+  return(proteins[, -1])
+}
+
+## Get the p values of the ANOVA
+## proteins Data Frame All the data of the experiments
+## Rodrigo Dorado
+
+##p_values <- calculateANOVA(proteins_for_ANOVA, orderCondition, ANOVA_type)
+
+calculateANOVA <- function(proteins, orderCondition, type = "ONE-WAY") { #"ONE-WAY", "TOW_WAY"
+  all_p_values <- c()
+  for (i in 1:nrow(proteins)) {
+    groupNames <- c()
+    for (j in colnames(proteins)) {
+      groupNames <- c(groupNames, row.names(which(orderCondition == j, arr.ind = TRUE))) 
+    }
+    MToANOVA <- data.frame(groupNames, as.numeric(t(proteins[i, ])))
+    colnames(MToANOVA) <- c('GROUP', 'DATA')
+    aov1 <- aov(DATA ~ GROUP, data = MToANOVA)
+    all_p_values <- c(all_p_values, summary(aov1)[[1]][["Pr(>F)"]][1])
+  }
+  p_values <- data.frame(all_p_values)
+  row.names(p_values) <- row.names(proteins)
+  return(p_values)
+}
+
+## Select the data that has less than 0.05 the p_value; Replace NA by 0
+## selected_columns_proteins Matrix the selected columns of the data
+## Rodrigo Dorado
+selectSignificanProteinsExpression <- function(selected_columns_proteins) {
+  protein_Qvalues <- qvalue(selected_columns_proteins$all_p_values, lambda=0)
+  selected_columns_proteins$QValue <- protein_Qvalues$qvalues
+  selectedSignificantExpr <- selected_columns_proteins[which(selected_columns_proteins$QValue < 0.05), c(1:10)]
+  all_row_names <- row.names(selectedSignificantExpr)
+  selectedSignificantExpr <- as.data.frame(sapply(selectedSignificantExpr, as.numeric))
+  row.names(selectedSignificantExpr) <- all_row_names
+  ##selectedSignificantExpr[is.na(selectedSignificantExpr)] <- 0
+  return(selectedSignificantExpr)
+}
+
+## Get Z Values of the proteins Expression
+## selectedSignificantExpr Matrix The values of the selected proteins
+## Rodrigo Dorado
+getZValueSgnificantProteinExpression <- function(selectedSignificantExpr) {
+  return(as.data.frame(t(scale(t(selectedSignificantExpr)))))
+}
+
+## Execute cluster function, save the result in a csv file. Note: add the condition of max clusters
+## ZValues Matrix The z values to cluster
+## numOfClusters int The max number of clusters
+## filename String The name of the csv fil
+## Rodrigo Dorado
+executeMClust <- function(ZValues, directoryCSV, filename, numOfClusters = 60) {
+  print("Clustering")
+  print(numOfClusters)
+  clusterResult <- Mclust(ZValues, G = 2:numOfClusters)
+  print(summary(clusterResult))
+  Num_clusters <- max(clusterResult$classification, na.rm=TRUE)
+  if (Num_clusters >= numOfClusters) {
+    newNumOfClusters <- numOfClusters + 30
+    executeMClust(ZValues, filename, newNumOfClusters)
+  } else {
+    clusters <- data.frame(ProteinID = names(clusterResult$classification), Cluster = clusterResult$classification, row.names = NULL)
+    table(clusters$Cluster)
+    setwd(directoryCSV)
+    #write.csv(clusters, filename, row.names = FALSE, quote = FALSE)
+    write.table(clusters, file = filename, row.names = FALSE, quote = FALSE, sep = "\t")
+    return(clusters) 
+  }
+}
+
+## Prepare data structure
+## ZValues Matrix the Z Values of the proteins selected
+## clusters Matrix The clusters of the proteins
+## Rodrigo Dorado
+getZvaluesMelt <- function(ZValues, clusters, orderCondition, columnsNameTime) {
+  timesCond <- row.names(orderCondition)
+  ZValues_melt <- ZValues
+  ZValues_melt$ProteinID <- rownames(ZValues_melt)
+  ZValues_melt <- merge(ZValues_melt, clusters, by.x = 'ProteinID', by.y = 'ProteinID')
+  ZValues_melt <- melt(ZValues_melt, id.vars = c('ProteinID','Cluster'))
+  colnames(ZValues_melt) <- c('ProteinID', 'Cluster', 'Time', 'RelativeExpression')
+  ZValues_melt$Cluster <- as.factor(ZValues_melt$Cluster)
+  for(cond in 1:nrow(orderCondition)) {
+    ZValues_melt$Time2[ZValues_melt$Time == timesCond[cond]] <- as.numeric(columnsNameTime[timesCond[cond], 2])
+  }
+  return(ZValues_melt)
+}
+
+## Test the cluster function with diferent sizes
+## zvalues data Frame The Z values of the data to get clustered
+## number_times array The sizes of the test
+## Rodrigo Dorado
+testCluster <- function (zvalues, number_times) {
+  for (i in number_times) {
+    print("***********")
+    print(i)
+    cluster_result <- Mclust(zvalues, G = 2:i)
+    print(summary(cluster_result))
+    print("-----------")
+  }
+}
+
+
+## Set the color of the background of the image
+## Rodrigo dorado
+setBackGroundColors <- function() {
+  rects <- as.data.frame(matrix(ncol = 3,nrow=2))
+  colnames(rects) <- c('xstart','xend','col')
+  rects[1,] <- c(-49,0,'gray0')
+  rects[2,] <- c(0,73,'white')
+  rects$xend <- as.numeric(rects$xend)
+  rects$xstart <- as.numeric(rects$xstart)
+  return(rects)
+}
+
+## Create new folder.
+## mainDir String The directory of the new folder.
+## folder String The name of the new folder.
+createFolder <- function(mainDir, folder) {
+  if (file.exists(folder)){
+    setwd(file.path(mainDir, folder))
+  } else {
+    dir.create(file.path(mainDir, folder))
+    setwd(file.path(mainDir, folder))
+  }
+}
+
+# Multiple plot function
+#
+# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
+# - cols:   Number of columns in layout
+# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
+#
+# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
+# then plot 1 will go in the upper left, 2 will go in the upper right, and
+# 3 will go all the way across the bottom.
+# http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2)/
+multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
+  library(grid)
+  
+  # Make a list from the ... arguments and plotlist
+  plots <- c(list(...), plotlist)
+  
+  numPlots = length(plots)
+  
+  # If layout is NULL, then use 'cols' to determine layout
+  if (is.null(layout)) {
+    # Make the panel
+    # ncol: Number of columns of plots
+    # nrow: Number of rows needed, calculated from # of cols
+    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
+                     ncol = cols, nrow = ceiling(numPlots/cols))
+  }
+  
+  if (numPlots==1) {
+    print(plots[[1]])
+    
+  } else {
+    # Set up the page
+    grid.newpage()
+    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
+    
+    # Make each plot, in the correct location
+    for (i in 1:numPlots) {
+      # Get the i,j matrix positions of the regions that contain this subplot
+      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
+      
+      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
+                                      layout.pos.col = matchidx$col))
+    }
+  }
+}
+
+## Save all the plots in a pdf file
+## name String The name of the file
+## data Array All the plots
+## Rodrigo Dorado
+savePlotPDF <- function(name, data) {
+  pdf(name)
+  invisible(lapply(data, print))
+  dev.off()
+}
+
+## Order the conditions per group
+## conditions Data Frame All the conditions
+## Rodrigo Dorado
+orderConditionData <- function(conditions) {
+  experimentNames <- unique(conditions[,2])
+  maxSizeId <- tail(names(sort(table(conditions[,2]))), 1)
+  maxSize <- nrow(conditions[conditions[,2] == maxSizeId,])
+  result <- matrix(nrow = length(experimentNames), ncol = maxSize)
+  row.names(result) <- experimentNames
+  colnames(result) <- c(1:maxSize)
+  for(i in 1:length(experimentNames)) {
+    for (j in 1:maxSize) {
+      result[experimentNames[i],j] <- conditions[conditions[,2] == experimentNames[i],][,1][j]
+    }
+  }
+  return(result)
+}
+
+## Get the times or names of the conditions.
+## conditions data frame All the text of the control file.
+## separaLine Int The line where the flag was founded.
+## Rodrigo Dorado
+getNamesOfConditions <- function(conditions, separateLine) {
+  columnsNameTime <- conditions[(as.numeric(separateLine) + 1):nrow(conditions), ]
+  colnames(columnsNameTime) <- columnsNameTime[1,]
+  columnsNameTime <- columnsNameTime[-1,]
+  row.names(columnsNameTime) <- columnsNameTime[,1]
+  ##columnsNameTime <- columnsNameTime[,-1]
+  return(columnsNameTime)
+}
+
+## Get the directory of a file
+## file String File name and directory
+## Rodrigo Dorado
+getDirectoryOfFile <- function(file) {
+  dir <- file
+  allDir <- strsplit(dir, "/")[[1]]
+  allEnters<-length(allDir) - 1
+  directory<-allDir[1]
+  for(i in 2:allEnters) {
+    directory<-paste(directory, allDir[i], sep = "/")
+  }
+  return(list("directory" = directory, "file" = allDir[length(allDir)]) )  
+}
+
+spec = matrix(c(
+  'filename', 'f', 1, "character",
+  'controlFile', 'c', 1, "character",
+  #'projectName', 'p', 1, "character",
+  'SpaceString', 's', 1, "character",
+  'idColumn', 'i', 1, "integer",
+  'firstExperiment', 'e', 1, "integer",
+  'csvFile', 'v', 1, "character",
+  'pdfFile', 'd', 1, "character"
+), byrow=TRUE, ncol=4)
+
+opt = getopt(spec) 
+
+filename <- opt$filename
+controlFileName <- opt$controlFile
+#project_name <- opt$projectName
+SpaceString <- opt$SpaceString
+id_column <- opt$idColumn
+first_experiment <- opt$firstExperiment
+csvFile <- opt$csvFile
+pdfFile <- opt$pdfFile
+
+
+directoryF <- getDirectoryOfFile(filename)
+directoryC <- getDirectoryOfFile(controlFileName)
+directoryCSV <- getDirectoryOfFile(csvFile)
+directoryPDF <- getDirectoryOfFile(pdfFile)
+
+
+#directory <- "/home/rodrigodorado/MClustDirectory/" ##Directory of the file
+#filename <- "EXP11_paraR_log.txt" ##File name
+#project_name <- "june_20_2018" ##Project name
+#controlFileName <- "EXP11_paraR_log_properties" ##Control File
+#id_column <- 31 ##Column number of the id of the proteins
+#first_experiment <- 0 ##Colmn number where the Experiments start
+#SpaceString <- "."
+ANOVA_type <- "ONE-WAY"
+#ANOVA_type <- "TOW_WAY"
+
+All_proteins <- readCsvDirectory(directoryF$directory, directoryF$file, SpaceString)
+conditions <- readCsvDirectory(directoryC$directory, directoryC$file, SpaceString)
+separateLine <- row.names(conditions[conditions[,1] == '-- SEPARATE --',])
+columnsNameTime <- getNamesOfConditions(conditions, separateLine)
+conditions <- conditions[1:(as.numeric(separateLine) - 1), ]
+printInfo(conditions)
+conditions[,1] <- gsub(" ", SpaceString, conditions[,1])
+printInfo(conditions)
+Column_division <- nrow(conditions)
+orderCondition <- orderConditionData(conditions)
+printInfo(orderCondition)
+printInfo(All_proteins)
+#createFolder(directory, project_name)
+proteins_complete <- getDataofInfo(All_proteins, id_column, first_experiment, Column_division)
+printInfo(proteins_complete)
+proteins_New_column <- VarifyAComplteGroup(proteins_complete, orderCondition)
+printInfo(proteins_New_column)
+proteins_for_ANOVA <- proteins_New_column[,1:Column_division]
+proteins <- proteins_New_column[,(Column_division + 1):ncol(proteins_New_column)]
+printInfo(proteins_for_ANOVA)
+printInfo(proteins)
+#write.csv(proteins_for_ANOVA, "Proteins For Anova.csv",row.names = TRUE, quote = FALSE)
+#write.csv(proteins, "Proteins With Conditions.csv",row.names = TRUE, quote = FALSE)
+p_values <- calculateANOVA(proteins_for_ANOVA, orderCondition, ANOVA_type)
+printInfo(p_values)
+proteins <- mergeById(proteins, p_values)
+printInfo(proteins)
+selectedSignificantExpr <- selectSignificanProteinsExpression(proteins)
+printInfo(selectedSignificantExpr)
+Sig_Pro_Exp_ZValues <- getZValueSgnificantProteinExpression(selectedSignificantExpr)
+printInfo(Sig_Pro_Exp_ZValues)
+
+clusters <- executeMClust(Sig_Pro_Exp_ZValues, directoryCSV$directory, directoryCSV$file)#'clusters.csv'
+printInfo(clusters)
+Sig_Pro_Exp_ZValues_melt <- getZvaluesMelt(Sig_Pro_Exp_ZValues, clusters, orderCondition, columnsNameTime)
+
+rects <- setBackGroundColors()
+
+##plot the result. note: make function(s)
+plots_withBackground <- list()
+plots_withOutBackground <- list()
+for (clusterNumber in 1:max(clusters$Cluster)){
+  averageProfile <- as.data.frame(matrix(ncol = nrow(orderCondition),nrow=1))
+  timesCond <- row.names(orderCondition)
+  colnames(averageProfile) <- timesCond
+  for (cond in 1:nrow(orderCondition)) {
+    averageProfile[timesCond[cond]] <- mean(selectedSignificantExpr[row.names(selectedSignificantExpr) %in% clusters[which(clusters$Cluster == clusterNumber),"ProteinID"], timesCond[cond]])
+  }
+  
+  zval_DEGenes_cluster <- as.data.frame(t(scale(t(averageProfile))))
+  zval_DEGenes_cluster$ClusterNumber <- clusterNumber
+  zval_DEGenes_cluster_melt <- melt(zval_DEGenes_cluster, id.vars="ClusterNumber")
+  zval_DEGenes_cluster_melt$ClusterNumber <- NULL
+  colnames(zval_DEGenes_cluster_melt) <- c("Time","RelativeExpression")
+  zval_DEGenes_cluster_melt$ProteinID <- "Average"
+  
+  for(cond in 1:nrow(orderCondition)) {
+    zval_DEGenes_cluster_melt$Time2[zval_DEGenes_cluster_melt$Time == timesCond[cond]] <- as.numeric(columnsNameTime[timesCond[cond], 2])
+  }
+  
+  numberGenes = length(unique(clusters[which(clusters$Cluster == clusterNumber),"ProteinID"]))
+  
+  outFilePlot1 = paste("cluster1",clusterNumber,"graphWithoutBackground","pdf",sep=".")
+  outFilePlot2 = paste("cluster1",clusterNumber,"graphWithBackground","pdf",sep=".")
+  
+  p <- ggplot(Sig_Pro_Exp_ZValues_melt[ which(Sig_Pro_Exp_ZValues_melt$Cluster == clusterNumber),], aes(x = Time2, y = RelativeExpression, group = ProteinID)) + 
+    geom_line(col = "lightgray") + 
+    geom_line(data = zval_DEGenes_cluster_melt, aes(x = Time2, y = RelativeExpression)) +
+    theme_bw() +
+    xlab("Time point") +
+    ylab("Relative expression") +
+    ggtitle(paste("Cluster 1", clusterNumber, sep = ".")) +
+    annotate("text", x = 3, y = 1, label = paste(numberGenes,"proteins", sep = " "))
+  ##ggsave(filename = outFilePlot1, plot = p)
+  plots_withOutBackground[[clusterNumber]] <- p
+}
+setwd(directoryPDF$directory)
+savePlotPDF(directoryPDF$file, plots_withOutBackground)
\ No newline at end of file
author	labis-app
date	Fri, 10 Aug 2018 15:45:44 -0400
parents
children