galaxy_proteomics: MClust_all.R comparison

comparison MClust_all.R @ 1:f3fa21cda5f5 draft default tip

planemo upload commit 13e72e84c523bda22bda792bbebf4720d28542d5-dirty

author	labis-app
date	Fri, 10 Aug 2018 15:45:44 -0400
parents
children

comparison

equal deleted inserted replaced

-:ba070efb6f78
+:f3fa21cda5f5
+###
+# title: ""
+# author: "Diego M. Riano-Pachon, Rodrigo R. Dorado Goitia"
+# date: ""
+# output: pdf
+###
+###
+# Description
+###
+### Installers ###
+# install.packages("qvalue")
+# install.packages("ggplot2")
+# install.packages("mclust")
+# install.packages("reshape2")
+# install.packages('getopt')
+rm(list=ls())
+#Load required packages
+suppressMessages(library(qvalue))
+suppressMessages(library(ggplot2))
+suppressMessages(library(mclust))
+suppressMessages(library(reshape2))
+suppressMessages(library(getopt))
+###
+# Set Functions
+# Set The directory to start the execution and read a csv file
+# directoryName String Name of the directory.
+# filename String Name of the csv file.
+# Rodrigo Dorado
+readCsvDirectory<-function(directoryName, filename, SpaceString) {
+setwd(directoryName)
+fileData <- read.csv(filename,
+sep = "\t",
+header=TRUE,
+na.strings = 'NaN',
+dec=SpaceString,
+stringsAsFactors=FALSE)
+return(fileData)
+}
+# Print the dim, columns name and the first 5 rows of a variable
+# dataToInform  Object Name of the varaible.
+# Rodrigo Dorado
+printInfo <- function(dataToInform) {
+print('Length')
+print(length(dataToInform))
+print('---------------')
+print('Dim')
+print(dim(dataToInform))
+print('***************')
+print('Column Names')
+print(colnames(dataToInform))
+print('_______________')
+print('Firsts rows')
+print(head(dataToInform))
+print('_______________')
+}
+## Put the id of protein as row name, get only the data of the experiments
+## proteins matrix All the proteins data
+## column_id int The number of the column where the is the id
+## init_column int The number of the colmun of the initial experiment
+## number_exp int The number of experiments
+## Rodrigo Dorado
+getDataofInfo <- function(proteins, column_id, init_column, number_exp) {
+selected_columns <- proteins[1:nrow(proteins), c(init_column:number_exp)]
+id_proteins_column <- proteins[1:nrow(proteins), column_id]
+ids <- matrix(, ncol = 1, nrow = nrow(proteins))
+j <- 1
+for(i in id_proteins_column){
+id_proteins_aux <- strsplit(i, ";")
+unlist(id_proteins_aux)
+ids[j] <- id_proteins_aux[[1]][1]
+j <- j + 1
+}
+row.names(selected_columns) <- ids
+return(selected_columns)
+}
+## Verify if the protein contain at least one group with all the values different to NaN
+## proteins matrix All the proteins data
+## orderCondition int The division per experiment
+## Rodrigo Dorado
+VarifyAComplteGroup <- function(proteins, orderCondition) {
+protein_erase <- c()
+col_names <- colnames(proteins)
+## Get every protein
+count <- 0
+for (protein in 1:nrow(proteins)) {
+ok_protein <- FALSE
+num_div <- ncol(orderCondition)
+columnNumber <- 1
+## Move by a experiment
+for (result_exp in seq(1, nrow(orderCondition))) {
+ok_group <- TRUE
+pos_NaN <- c()
+pos_valid <- c()
+## Get the name of the experiment
+column_name <- row.names(orderCondition)[result_exp]
+## Move into the divisions
+for (div_pos in 1:num_div) {
+if (is.na(orderCondition[result_exp, div_pos])){
+break
+}
+pos <- columnNumber
+columnNumber <- columnNumber + 1
+## pos <- result_exp + (div_pos - 1)
+## check if is NaN
+if(is.na(proteins[protein, pos])){
+ok_group <- FALSE
+pos_NaN <- c(pos_NaN, pos)
+} else {
+pos_valid <- c(pos_valid, pos)
+}
+}
+##Check if in the group, if the three are valid
+if (ok_group) {
+ok_protein <- TRUE
+}
+switch_var <- length(pos_NaN)
+allPos <- c(pos_valid, pos_NaN)
+## Execute conditions
+if(length(pos_valid) == 0){
+proteins[protein, pos_NaN] <- 0
+}else if(length(pos_NaN) > 0){
+proteins[protein, pos_NaN] <- getTheMedian(proteins, protein, pos_valid)
+}
+resultValue <- getTheMedian(proteins, protein, allPos)
+##if (switch_var == 0) {
+##resultValue <- getTheMedian(proteins, protein, allPos)
+##caso1 <- caso1 + 1
+##}
+##if (switch_var == 1) {
+##proteins[protein, pos_NaN] <- mean(as.numeric(proteins[protein, pos_valid]))
+##resultValue <- getTheMedian(proteins, protein, allPos)
+##caso2 <- caso2 + 1
+##}
+##if (switch_var == 2) {
+##proteins[protein, pos_NaN] <- 0
+#### resultValue <- 0
+##resultValue <- proteins[protein,   ]
+##caso3 <- caso3 + 1
+##}
+##if (switch_var == 3) {
+##proteins[protein, pos_NaN] <- 0
+##resultValue <- 0
+##caso4 <- caso4 + 1
+##}
+## Get the unique value
+proteins[protein, column_name] <- resultValue
+}
+## Check if the protein conatin at least one group valid
+if (!ok_protein) {
+protein_erase <- c(protein_erase, protein)
+}
+}
+## return the list
+if(length(protein_erase) > 0) {
+return(proteins[-protein_erase,])
+}else{
+return(proteins)
+}
+}
+## Get the median of a group
+## proteins matrix All the proteins
+## protein int The actual protein
+## positions int The number of experiment
+## Rodrigo Dorado
+getTheMedian <- function(proteins, protein, positions) {
+return(median(as.numeric(proteins[protein, positions])))
+}
+## Merge two data frames by the id
+## proteins Data Frame The first data frame
+## p_values Data Frame The second data frame
+mergeById <- function(proteins, p_values) {
+proteins$auxiliar <- row.names(proteins)
+p_values$auxiliar <- row.names(p_values)
+proteins <- merge(proteins, p_values, by.x = 'auxiliar', by.y = 'auxiliar')
+row.names(proteins) <- proteins[, 'auxiliar']
+return(proteins[, -1])
+}
+## Get the p values of the ANOVA
+## proteins Data Frame All the data of the experiments
+## Rodrigo Dorado
+##p_values <- calculateANOVA(proteins_for_ANOVA, orderCondition, ANOVA_type)
+calculateANOVA <- function(proteins, orderCondition, type = "ONE-WAY") { #"ONE-WAY", "TOW_WAY"
+all_p_values <- c()
+for (i in 1:nrow(proteins)) {
+groupNames <- c()
+for (j in colnames(proteins)) {
+groupNames <- c(groupNames, row.names(which(orderCondition == j, arr.ind = TRUE)))
+}
+MToANOVA <- data.frame(groupNames, as.numeric(t(proteins[i, ])))
+colnames(MToANOVA) <- c('GROUP', 'DATA')
+aov1 <- aov(DATA ~ GROUP, data = MToANOVA)
+all_p_values <- c(all_p_values, summary(aov1)[[1]][["Pr(>F)"]][1])
+}
+p_values <- data.frame(all_p_values)
+row.names(p_values) <- row.names(proteins)
+return(p_values)
+}
+## Select the data that has less than 0.05 the p_value; Replace NA by 0
+## selected_columns_proteins Matrix the selected columns of the data
+## Rodrigo Dorado
+selectSignificanProteinsExpression <- function(selected_columns_proteins) {
+protein_Qvalues <- qvalue(selected_columns_proteins$all_p_values, lambda=0)
+selected_columns_proteins$QValue <- protein_Qvalues$qvalues
+selectedSignificantExpr <- selected_columns_proteins[which(selected_columns_proteins$QValue < 0.05), c(1:10)]
+all_row_names <- row.names(selectedSignificantExpr)
+selectedSignificantExpr <- as.data.frame(sapply(selectedSignificantExpr, as.numeric))
+row.names(selectedSignificantExpr) <- all_row_names
+##selectedSignificantExpr[is.na(selectedSignificantExpr)] <- 0
+return(selectedSignificantExpr)
+}
+## Get Z Values of the proteins Expression
+## selectedSignificantExpr Matrix The values of the selected proteins
+## Rodrigo Dorado
+getZValueSgnificantProteinExpression <- function(selectedSignificantExpr) {
+return(as.data.frame(t(scale(t(selectedSignificantExpr)))))
+}
+## Execute cluster function, save the result in a csv file. Note: add the condition of max clusters
+## ZValues Matrix The z values to cluster
+## numOfClusters int The max number of clusters
+## filename String The name of the csv fil
+## Rodrigo Dorado
+executeMClust <- function(ZValues, directoryCSV, filename, numOfClusters = 60) {
+print("Clustering")
+print(numOfClusters)
+clusterResult <- Mclust(ZValues, G = 2:numOfClusters)
+print(summary(clusterResult))
+Num_clusters <- max(clusterResult$classification, na.rm=TRUE)
+if (Num_clusters >= numOfClusters) {
+newNumOfClusters <- numOfClusters + 30
+executeMClust(ZValues, filename, newNumOfClusters)
+} else {
+clusters <- data.frame(ProteinID = names(clusterResult$classification), Cluster = clusterResult$classification, row.names = NULL)
+table(clusters$Cluster)
+setwd(directoryCSV)
+#write.csv(clusters, filename, row.names = FALSE, quote = FALSE)
+write.table(clusters, file = filename, row.names = FALSE, quote = FALSE, sep = "\t")
+return(clusters)
+}
+}
+## Prepare data structure
+## ZValues Matrix the Z Values of the proteins selected
+## clusters Matrix The clusters of the proteins
+## Rodrigo Dorado
+getZvaluesMelt <- function(ZValues, clusters, orderCondition, columnsNameTime) {
+timesCond <- row.names(orderCondition)
+ZValues_melt <- ZValues
+ZValues_melt$ProteinID <- rownames(ZValues_melt)
+ZValues_melt <- merge(ZValues_melt, clusters, by.x = 'ProteinID', by.y = 'ProteinID')
+ZValues_melt <- melt(ZValues_melt, id.vars = c('ProteinID','Cluster'))
+colnames(ZValues_melt) <- c('ProteinID', 'Cluster', 'Time', 'RelativeExpression')
+ZValues_melt$Cluster <- as.factor(ZValues_melt$Cluster)
+for(cond in 1:nrow(orderCondition)) {
+ZValues_melt$Time2[ZValues_melt$Time == timesCond[cond]] <- as.numeric(columnsNameTime[timesCond[cond], 2])
+}
+return(ZValues_melt)
+}
+## Test the cluster function with diferent sizes
+## zvalues data Frame The Z values of the data to get clustered
+## number_times array The sizes of the test
+## Rodrigo Dorado
+testCluster <- function (zvalues, number_times) {
+for (i in number_times) {
+print("***********")
+print(i)
+cluster_result <- Mclust(zvalues, G = 2:i)
+print(summary(cluster_result))
+print("-----------")
+}
+}
+## Set the color of the background of the image
+## Rodrigo dorado
+setBackGroundColors <- function() {
+rects <- as.data.frame(matrix(ncol = 3,nrow=2))
+colnames(rects) <- c('xstart','xend','col')
+rects[1,] <- c(-49,0,'gray0')
+rects[2,] <- c(0,73,'white')
+rects$xend <- as.numeric(rects$xend)
+rects$xstart <- as.numeric(rects$xstart)
+return(rects)
+}
+## Create new folder.
+## mainDir String The directory of the new folder.
+## folder String The name of the new folder.
+createFolder <- function(mainDir, folder) {
+if (file.exists(folder)){
+setwd(file.path(mainDir, folder))
+} else {
+dir.create(file.path(mainDir, folder))
+setwd(file.path(mainDir, folder))
+}
+}
+# Multiple plot function
+#
+# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
+# - cols:   Number of columns in layout
+# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
+#
+# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
+# then plot 1 will go in the upper left, 2 will go in the upper right, and
+# 3 will go all the way across the bottom.
+# http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2)/
+multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
+library(grid)
+# Make a list from the ... arguments and plotlist
+plots <- c(list(...), plotlist)
+numPlots = length(plots)
+# If layout is NULL, then use 'cols' to determine layout
+if (is.null(layout)) {
+# Make the panel
+# ncol: Number of columns of plots
+# nrow: Number of rows needed, calculated from # of cols
+layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
+ncol = cols, nrow = ceiling(numPlots/cols))
+}
+if (numPlots==1) {
+print(plots[[1]])
+} else {
+# Set up the page
+grid.newpage()
+pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
+# Make each plot, in the correct location
+for (i in 1:numPlots) {
+# Get the i,j matrix positions of the regions that contain this subplot
+matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
+print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
+layout.pos.col = matchidx$col))
+}
+}
+}
+## Save all the plots in a pdf file
+## name String The name of the file
+## data Array All the plots
+## Rodrigo Dorado
+savePlotPDF <- function(name, data) {
+pdf(name)
+invisible(lapply(data, print))
+dev.off()
+}
+## Order the conditions per group
+## conditions Data Frame All the conditions
+## Rodrigo Dorado
+orderConditionData <- function(conditions) {
+experimentNames <- unique(conditions[,2])
+maxSizeId <- tail(names(sort(table(conditions[,2]))), 1)
+maxSize <- nrow(conditions[conditions[,2] == maxSizeId,])
+result <- matrix(nrow = length(experimentNames), ncol = maxSize)
+row.names(result) <- experimentNames
+colnames(result) <- c(1:maxSize)
+for(i in 1:length(experimentNames)) {
+for (j in 1:maxSize) {
+result[experimentNames[i],j] <- conditions[conditions[,2] == experimentNames[i],][,1][j]
+}
+}
+return(result)
+}
+## Get the times or names of the conditions.
+## conditions data frame All the text of the control file.
+## separaLine Int The line where the flag was founded.
+## Rodrigo Dorado
+getNamesOfConditions <- function(conditions, separateLine) {
+columnsNameTime <- conditions[(as.numeric(separateLine) + 1):nrow(conditions), ]
+colnames(columnsNameTime) <- columnsNameTime[1,]
+columnsNameTime <- columnsNameTime[-1,]
+row.names(columnsNameTime) <- columnsNameTime[,1]
+##columnsNameTime <- columnsNameTime[,-1]
+return(columnsNameTime)
+}
+## Get the directory of a file
+## file String File name and directory
+## Rodrigo Dorado
+getDirectoryOfFile <- function(file) {
+dir <- file
+allDir <- strsplit(dir, "/")[[1]]
+allEnters<-length(allDir) - 1
+directory<-allDir[1]
+for(i in 2:allEnters) {
+directory<-paste(directory, allDir[i], sep = "/")
+}
+return(list("directory" = directory, "file" = allDir[length(allDir)]) )
+}
+spec = matrix(c(
+'filename', 'f', 1, "character",
+'controlFile', 'c', 1, "character",
+#'projectName', 'p', 1, "character",
+'SpaceString', 's', 1, "character",
+'idColumn', 'i', 1, "integer",
+'firstExperiment', 'e', 1, "integer",
+'csvFile', 'v', 1, "character",
+'pdfFile', 'd', 1, "character"
+), byrow=TRUE, ncol=4)
+opt = getopt(spec)
+filename <- opt$filename
+controlFileName <- opt$controlFile
+#project_name <- opt$projectName
+SpaceString <- opt$SpaceString
+id_column <- opt$idColumn
+first_experiment <- opt$firstExperiment
+csvFile <- opt$csvFile
+pdfFile <- opt$pdfFile
+directoryF <- getDirectoryOfFile(filename)
+directoryC <- getDirectoryOfFile(controlFileName)
+directoryCSV <- getDirectoryOfFile(csvFile)
+directoryPDF <- getDirectoryOfFile(pdfFile)
+#directory <- "/home/rodrigodorado/MClustDirectory/" ##Directory of the file
+#filename <- "EXP11_paraR_log.txt" ##File name
+#project_name <- "june_20_2018" ##Project name
+#controlFileName <- "EXP11_paraR_log_properties" ##Control File
+#id_column <- 31 ##Column number of the id of the proteins
+#first_experiment <- 0 ##Colmn number where the Experiments start
+#SpaceString <- "."
+ANOVA_type <- "ONE-WAY"
+#ANOVA_type <- "TOW_WAY"
+All_proteins <- readCsvDirectory(directoryF$directory, directoryF$file, SpaceString)
+conditions <- readCsvDirectory(directoryC$directory, directoryC$file, SpaceString)
+separateLine <- row.names(conditions[conditions[,1] == '-- SEPARATE --',])
+columnsNameTime <- getNamesOfConditions(conditions, separateLine)
+conditions <- conditions[1:(as.numeric(separateLine) - 1), ]
+printInfo(conditions)
+conditions[,1] <- gsub(" ", SpaceString, conditions[,1])
+printInfo(conditions)
+Column_division <- nrow(conditions)
+orderCondition <- orderConditionData(conditions)
+printInfo(orderCondition)
+printInfo(All_proteins)
+#createFolder(directory, project_name)
+proteins_complete <- getDataofInfo(All_proteins, id_column, first_experiment, Column_division)
+printInfo(proteins_complete)
+proteins_New_column <- VarifyAComplteGroup(proteins_complete, orderCondition)
+printInfo(proteins_New_column)
+proteins_for_ANOVA <- proteins_New_column[,1:Column_division]
+proteins <- proteins_New_column[,(Column_division + 1):ncol(proteins_New_column)]
+printInfo(proteins_for_ANOVA)
+printInfo(proteins)
+#write.csv(proteins_for_ANOVA, "Proteins For Anova.csv",row.names = TRUE, quote = FALSE)
+#write.csv(proteins, "Proteins With Conditions.csv",row.names = TRUE, quote = FALSE)
+p_values <- calculateANOVA(proteins_for_ANOVA, orderCondition, ANOVA_type)
+printInfo(p_values)
+proteins <- mergeById(proteins, p_values)
+printInfo(proteins)
+selectedSignificantExpr <- selectSignificanProteinsExpression(proteins)
+printInfo(selectedSignificantExpr)
+Sig_Pro_Exp_ZValues <- getZValueSgnificantProteinExpression(selectedSignificantExpr)
+printInfo(Sig_Pro_Exp_ZValues)
+clusters <- executeMClust(Sig_Pro_Exp_ZValues, directoryCSV$directory, directoryCSV$file)#'clusters.csv'
+printInfo(clusters)
+Sig_Pro_Exp_ZValues_melt <- getZvaluesMelt(Sig_Pro_Exp_ZValues, clusters, orderCondition, columnsNameTime)
+rects <- setBackGroundColors()
+##plot the result. note: make function(s)
+plots_withBackground <- list()
+plots_withOutBackground <- list()
+for (clusterNumber in 1:max(clusters$Cluster)){
+averageProfile <- as.data.frame(matrix(ncol = nrow(orderCondition),nrow=1))
+timesCond <- row.names(orderCondition)
+colnames(averageProfile) <- timesCond
+for (cond in 1:nrow(orderCondition)) {
+averageProfile[timesCond[cond]] <- mean(selectedSignificantExpr[row.names(selectedSignificantExpr) %in% clusters[which(clusters$Cluster == clusterNumber),"ProteinID"], timesCond[cond]])
+}
+zval_DEGenes_cluster <- as.data.frame(t(scale(t(averageProfile))))
+zval_DEGenes_cluster$ClusterNumber <- clusterNumber
+zval_DEGenes_cluster_melt <- melt(zval_DEGenes_cluster, id.vars="ClusterNumber")
+zval_DEGenes_cluster_melt$ClusterNumber <- NULL
+colnames(zval_DEGenes_cluster_melt) <- c("Time","RelativeExpression")
+zval_DEGenes_cluster_melt$ProteinID <- "Average"
+for(cond in 1:nrow(orderCondition)) {
+zval_DEGenes_cluster_melt$Time2[zval_DEGenes_cluster_melt$Time == timesCond[cond]] <- as.numeric(columnsNameTime[timesCond[cond], 2])
+}
+numberGenes = length(unique(clusters[which(clusters$Cluster == clusterNumber),"ProteinID"]))
+outFilePlot1 = paste("cluster1",clusterNumber,"graphWithoutBackground","pdf",sep=".")
+outFilePlot2 = paste("cluster1",clusterNumber,"graphWithBackground","pdf",sep=".")
+p <- ggplot(Sig_Pro_Exp_ZValues_melt[ which(Sig_Pro_Exp_ZValues_melt$Cluster == clusterNumber),], aes(x = Time2, y = RelativeExpression, group = ProteinID)) +
+geom_line(col = "lightgray") +
+geom_line(data = zval_DEGenes_cluster_melt, aes(x = Time2, y = RelativeExpression)) +
+theme_bw() +
+xlab("Time point") +
+ylab("Relative expression") +
+ggtitle(paste("Cluster 1", clusterNumber, sep = ".")) +
+annotate("text", x = 3, y = 1, label = paste(numberGenes,"proteins", sep = " "))
+##ggsave(filename = outFilePlot1, plot = p)
+plots_withOutBackground[[clusterNumber]] <- p
+}
+setwd(directoryPDF$directory)
+savePlotPDF(directoryPDF$file, plots_withOutBackground)

Mercurial > repos > labis-app > galaxy_proteomics

comparison MClust_all.R @ 1:f3fa21cda5f5 draft default tip