Mercurial > repos > ethevenot > multivariate
view multivariate_wrapper.R @ 0:fafba524dca6 draft
planemo upload for repository https://github.com/workflow4metabolomics/multivariate.git commit 6596dbd39d20ee1962d9ebdd87eec04821239760
author | ethevenot |
---|---|
date | Wed, 27 Jul 2016 11:22:56 -0400 |
parents | |
children | da272496b32d |
line wrap: on
line source
#!/usr/bin/env Rscript library(batch) ## parseCommandArgs ######## # MAIN # ######## argVc <- unlist(parseCommandArgs(evaluate=FALSE)) #### Start_of_tested_code <- function() {} ##------------------------------ ## Initializing ##------------------------------ ## options ##-------- strAsFacL <- options()$stringsAsFactors options(stringsAsFactors = FALSE) ## libraries ##---------- suppressMessages(library(ropls)) if(packageVersion("ropls") < "1.4.0") cat("\nWarning: new version of the 'ropls' package is available\n", sep="") ## constants ##---------- modNamC <- "Multivariate" ## module name topEnvC <- environment() flgC <- "\n" ## functions ##---------- flgF <- function(tesC, envC = topEnvC, txtC = NA) { ## management of warning and error messages tesL <- eval(parse(text = tesC), envir = envC) if(!tesL) { sink(NULL) stpTxtC <- ifelse(is.na(txtC), paste0(tesC, " is FALSE"), txtC) stop(stpTxtC, call. = FALSE) } } ## flgF ## log file ##--------- sink(argVc["information"]) cat("\nStart of the '", modNamC, "' Galaxy module call: ", format(Sys.time(), "%a %d %b %Y %X"), "\n", sep="") ## arguments ##---------- xMN <- t(as.matrix(read.table(argVc["dataMatrix_in"], check.names = FALSE, header = TRUE, row.names = 1, sep = "\t"))) samDF <- read.table(argVc["sampleMetadata_in"], check.names = FALSE, header = TRUE, row.names = 1, sep = "\t") flgF("identical(rownames(xMN), rownames(samDF))", txtC = "Sample names (or number) in the data matrix (first row) and sample metadata (first column) are not identical; use the 'Check Format' module in the 'Quality Control' section") varDF <- read.table(argVc["variableMetadata_in"], check.names = FALSE, header = TRUE, row.names = 1, sep = "\t") flgF("identical(colnames(xMN), rownames(varDF))", txtC = "Variable names (or number) in the data matrix (first column) and sample metadata (first column) are not identical; use the 'Check Format' module in the 'Quality Control' section") flgF("argVc['respC'] == 'none' || (argVc['respC'] %in% colnames(samDF))", txtC = paste0("Y Response argument (", argVc['respC'], ") must be either none or one of the column names (first row) of your sample metadata")) if(argVc["respC"] != "none") { yMCN <- matrix(samDF[, argVc['respC']], ncol = 1, dimnames = list(rownames(xMN), argVc['respC'])) } else yMCN <- NULL if(argVc["testL"] == "TRUE") { flgF("!is.null(yMCN)", txtC = "Predictions cannot be peformed with PCA models") flgF("'test.' %in% colnames(samDF)", txtC = "No 'test.' column found in the sample metadata") flgF("identical(sort(unique(samDF[, 'test.'])), c('no', 'yes'))", txtC = "'test.' column of sample metadata must contain both 'yes' (tested samples) and 'no' (samples to be used for model training) values, and nothing else") flgF("identical(sort(unique(samDF[, 'test.'])), c('no', 'yes'))", txtC = "'test.' column of sample metadata must contain both 'yes' (tested samples) and 'no' (samples to be used for model training) values, and nothing else") flgF("!any(is.na(yMCN[samDF[, 'test.'] == 'no', ]))", txtC = "samples for model training (i.e. 'no' value in the 'test.' column) should not contain NA in the response") tesVl <- samDF[, "test."] == "yes" xTesMN <- xMN[tesVl, , drop = FALSE] xMN <- xMN[!tesVl, , drop = FALSE] yMCN <- yMCN[!tesVl, , drop = FALSE] } else tesVl <- NULL if(!('parAsColC' %in% names(argVc))) argVc["parAsColC"] <- "none" flgF("argVc['parAsColC'] == 'none' || argVc['parAsColC'] %in% colnames(samDF)", txtC = paste0("Sample color argument (", argVc['parAsColC'], ") must be either none or one of the column names (first row) of your sample metadata")) if(argVc["parAsColC"] != "none") { parAsColFcVn <- samDF[, argVc['parAsColC']] if(is.character(parAsColFcVn)) parAsColFcVn <- factor(parAsColFcVn) } else parAsColFcVn <- NA if(!('parMahalC' %in% names(argVc)) || argVc["parMahalC"] == "NA") { if(!is.null(yMCN) && ncol(yMCN) == 1 && mode(yMCN) == "character") argVc["parMahalC"] <- argVc["respC"] else argVc["parMahalC"] <- "none" } flgF("argVc['parMahalC'] == 'none' || (argVc['parMahalC'] %in% colnames(samDF))", txtC = paste0("Mahalanobis argument (", argVc['parMahalC'], ") must be either 'NA', 'none' or one of the column names (first row) of your sample metadata")) if(argVc["parMahalC"] == "none") { parEllipsesL <- FALSE } else { if(is.null(yMCN)) { ## PCA case flgF("mode(samDF[, argVc['parMahalC']]) == 'character'", txtC = paste0("Mahalanobis argument (", argVc['parMahalC'], ") must correspond to a column of characters in your sampleMetadata")) parAsColFcVn <- factor(samDF[, argVc["parMahalC"]]) parEllipsesL <- TRUE } else { ## (O)PLS-DA case flgF("identical(as.character(argVc['respC']), as.character(argVc['parMahalC']))", txtC = paste0("The Mahalanobis argument (", argVc['parMahalC'], ") must be identical to the Y response argument (", argVc['respC'], ")")) parEllipsesL <- TRUE } } if(!('parLabVc' %in% names(argVc))) argVc["parLabVc"] <- "none" flgF("argVc['parLabVc'] == 'none' || (argVc['parLabVc'] %in% colnames(samDF))", txtC = paste0("Sample labels argument (", argVc['parLabVc'], ") must be either none or one of the column names (first row) of your sample metadata")) if('parLabVc' %in% names(argVc)) if(argVc["parLabVc"] != "none") { flgF("mode(samDF[, argVc['parLabVc']]) == 'character'", txtC = paste0("The sample label argument (", argVc['parLabVc'], ") must correspond to a sample metadata column of characters (not numerics)")) parLabVc <- samDF[, argVc['parLabVc']] } else parLabVc <- NA if('parPc1I' %in% names(argVc)) { parCompVi <- as.numeric(c(argVc["parPc1I"], argVc["parPc2I"])) } else parCompVi <- c(1, 2) ## checking ##--------- flgF("argVc['predI'] == 'NA' || argVc['orthoI'] == 'NA' || as.numeric(argVc['orthoI']) > 0 || parCompVi[2] <= as.numeric(argVc['predI'])", txtC = paste0("The highest component to display (", parCompVi[2], ") must not exceed the number of predictive components of the model (", argVc['predI'], ")")) if(argVc["orthoI"] == "NA" || argVc["orthoI"] != "0") if(argVc["predI"] == "NA" || argVc["predI"] != "0") { argVc["predI"] <- "1" cat("\nWarning: OPLS: number of predictive components ('predI' argument) set to 1\n", sep = "") } if(argVc["predI"] != "NA") if(as.numeric(argVc["predI"]) > min(nrow(xMN), ncol(xMN))) { argVc["predI"] <- as.character(min(nrow(xMN), ncol(xMN))) cat("\nWarning: 'predI' set to the minimum of the dataMatrix dimensions: ", as.numeric(argVc["predI"]), "\n", sep = "") } if("algoC" %in% names(argVc) && argVc["algoC"] == "svd" && length(which(is.na(c(xMN)))) > 0) { minN <- min(c(xMN[!is.na(xMN)])) / 2 cat("\nWarning: Missing values set to ", round(minN, 1), " (half minimum value) for 'svd' algorithm to be used\n", sep = "") } ##------------------------------ ## Computation and plot ##------------------------------ sink() optWrnN <- options()$warn options(warn = -1) ropLs <- opls(x = xMN, y = yMCN, predI = ifelse(argVc["predI"] == "NA", NA, as.numeric(argVc["predI"])), orthoI = ifelse(argVc["orthoI"] == "NA", NA, as.numeric(argVc["orthoI"])), algoC = ifelse('algoC' %in% names(argVc), argVc["algoC"], "default"), crossvalI = ifelse('crossvalI' %in% names(argVc), as.numeric(argVc["crossvalI"]), 7), log10L = ifelse('log10L' %in% names(argVc), as.logical(argVc["log10L"]), FALSE), permI = ifelse('permI' %in% names(argVc), as.numeric(argVc["permI"]), 20), scaleC = ifelse('scaleC' %in% names(argVc), argVc["scaleC"], "standard"), subset = NULL, printL = FALSE, plotL = FALSE, .sinkC = argVc['information']) modC <- ropLs@typeC sumDF <- getSummaryDF(ropLs) desMC <- ropLs@descriptionMC scoreMN <- getScoreMN(ropLs) loadingMN <- getLoadingMN(ropLs) vipVn <- coeMN <- orthoScoreMN <- orthoLoadingMN <- orthoVipVn <- NULL if(grepl("PLS", modC)) { vipVn <- getVipVn(ropLs) coeMN <- coef(ropLs) if(grepl("OPLS", modC)) { orthoScoreMN <- getScoreMN(ropLs, orthoL = TRUE) orthoLoadingMN <- getLoadingMN(ropLs, orthoL = TRUE) orthoVipVn <- getVipVn(ropLs, orthoL = TRUE) } } ploC <- ifelse('typeC' %in% names(argVc), argVc["typeC"], "summary") if(sumDF[, "pre"] + sumDF[, "ort"] < 2) { if(!(ploC %in% c("permutation", "overview"))) { ploC <- "summary" plotWarnL <- TRUE } } else plotWarnL <- FALSE plot(ropLs, typeVc = ploC, parAsColFcVn = parAsColFcVn, parCexN = ifelse('parCexN' %in% names(argVc), as.numeric(argVc["parCexN"]), 0.8), parCompVi = parCompVi, parEllipsesL = parEllipsesL, parLabVc = parLabVc, file.pdfC = argVc['figure'], .sinkC = argVc['information']) options(warn = optWrnN) ##------------------------------ ## Print ##------------------------------ sink(argVc["information"], append = TRUE) if(plotWarnL) cat("\nWarning: For single component models, only 'overview' (and 'permutation' in case of single response (O)PLS(-DA)) plot(s) are available\n", sep = "") cat("\n", modC, "\n", sep = "") cat("\n", desMC["samples", ], " samples x ", desMC["X_variables", ], " variables", ifelse(modC != "PCA", " and 1 response", ""), "\n", sep = "") cat("\n", ropLs@suppLs[["scaleC"]], " scaling of dataMatrix", ifelse(modC == "PCA", "", paste0(" and ", ifelse(mode(ropLs@suppLs[["yMCN"]]) == "character" && ropLs@suppLs[["scaleC"]] != "standard", "standard scaling of ", ""), "response\n")), sep = "") if(substr(desMC["missing_values", ], 1, 1) != "0") cat("\n", desMC["missing_values", ], " NAs\n", sep = "") if(substr(desMC["near_zero_excluded_X_variables", ], 1, 1) != "0") cat("\n", desMC["near_zero_excluded_X_variables", ], " excluded variables during model building (because of near zero variance)\n", sep = "") cat("\n") optDigN <- options()[["digits"]] options(digits = 3) print(ropLs@modelDF) options(digits = optDigN) ##------------------------------ ## Ending ##------------------------------ ## Saving ##------- rspModC <- gsub("-", "", modC) if(rspModC != "PCA") rspModC <- paste0(make.names(argVc['respC']), "_", rspModC) if(sumDF[, "pre"] + sumDF[, "ort"] < 2) { tCompMN <- scoreMN pCompMN <- loadingMN } else { if(sumDF[, "ort"] > 0) { if(parCompVi[2] > sumDF[, "ort"] + 1) stop("Selected orthogonal component for plotting (ordinate) exceeds the total number of orthogonal components of the model", call. = FALSE) tCompMN <- cbind(scoreMN[, 1], orthoScoreMN[, parCompVi[2] - 1]) pCompMN <- cbind(loadingMN[, 1], orthoLoadingMN[, parCompVi[2] - 1]) colnames(pCompMN) <- colnames(tCompMN) <- c("h1", paste("o", parCompVi[2] - 1, sep = "")) } else { if(max(parCompVi) > sumDF[, "pre"]) stop("Selected component for plotting as ordinate exceeds the total number of predictive components of the model", call. = FALSE) tCompMN <- scoreMN[, parCompVi, drop = FALSE] pCompMN <- loadingMN[, parCompVi, drop = FALSE] } } ## x-scores and prediction colnames(tCompMN) <- paste0(rspModC, "_XSCOR-", colnames(tCompMN)) tCompDF <- as.data.frame(tCompMN)[rownames(samDF), , drop = FALSE] if(modC != "PCA") { if(!is.null(tesVl)) { tCompFulMN <- matrix(NA, nrow = nrow(samDF), ncol = ncol(tCompMN), dimnames = list(rownames(samDF), colnames(tCompMN))) mode(tCompFulMN) <- "numeric" tCompFulMN[rownames(tCompMN), ] <- tCompMN tCompMN <- tCompFulMN fitMCN <- fitted(ropLs) fitFulMCN <- matrix(NA, nrow = nrow(samDF), ncol = 1, dimnames = list(rownames(samDF), NULL)) mode(fitFulMCN) <- mode(fitMCN) fitFulMCN[rownames(fitMCN), ] <- fitMCN yPreMCN <- predict(ropLs, newdata = as.data.frame(xTesMN)) fitFulMCN[rownames(yPreMCN), ] <- yPreMCN fitMCN <- fitFulMCN } else fitMCN <- fitted(ropLs) colnames(fitMCN) <- paste0(rspModC, "_predictions") fitDF <- as.data.frame(fitMCN)[rownames(samDF), , drop = FALSE] tCompDF <- cbind.data.frame(tCompDF, fitDF) } samDF <- cbind.data.frame(samDF, tCompDF) ## x-loadings and VIP colnames(pCompMN) <- paste0(rspModC, "_XLOAD-", colnames(pCompMN)) if(!is.null(vipVn)) { pCompMN <- cbind(pCompMN, vipVn) colnames(pCompMN)[ncol(pCompMN)] <- paste0(rspModC, "_VIP", ifelse(!is.null(orthoVipVn), "_pred", "")) if(!is.null(orthoVipVn)) { pCompMN <- cbind(pCompMN, orthoVipVn) colnames(pCompMN)[ncol(pCompMN)] <- paste0(rspModC, "_VIP_ortho") } } if(!is.null(coeMN)) { pCompMN <- cbind(pCompMN, coeMN) colnames(pCompMN)[ncol(pCompMN)] <- paste0(rspModC, "_COEFF") } pCompDF <- as.data.frame(pCompMN)[rownames(varDF), , drop = FALSE] varDF <- cbind.data.frame(varDF, pCompDF) ## sampleMetadata samDF <- cbind.data.frame(sampleMetadata = rownames(samDF), samDF) write.table(samDF, file = argVc["sampleMetadata_out"], quote = FALSE, row.names = FALSE, sep = "\t") ## variableMetadata varDF <- cbind.data.frame(variableMetadata = rownames(varDF), varDF) write.table(varDF, file = argVc["variableMetadata_out"], quote = FALSE, row.names = FALSE, sep = "\t") # Output ropLs if ( ! is.null(argVc['ropls_out'])) save(ropLs, file = argVc['ropls_out']) ## Closing ##-------- cat("\nEnd of '", modNamC, "' Galaxy module call: ", as.character(Sys.time()), "\n", sep = "") sink() options(stringsAsFactors = strAsFacL) #### End_of_tested_code <- function() {} rm(list = ls())