Mercurial > repos > ethevenot > multivariate
view multivariate_wrapper.R @ 4:5526f8258e8a draft default tip
planemo upload for repository https://github.com/workflow4metabolomics/multivariate.git commit 0f382a5296aae9bfc77df06b0a5ad493eb3c01f3
author | ethevenot |
---|---|
date | Wed, 28 Feb 2018 09:59:25 -0500 |
parents | e91de3b04320 |
children |
line wrap: on
line source
#!/usr/bin/env Rscript library(batch) ## parseCommandArgs # Constants argv <- commandArgs(trailingOnly = FALSE) script.path <- sub("--file=","",argv[grep("--file=",argv)]) prog.name <- basename(script.path) # Print help if (length(grep('-h', argv)) >0) { cat("Usage:", prog.name, "dataMatrix_in myDataMatrix.tsv", "sampleMetadata_in mySampleData.tsv", "variableMetadata_in myVariableMetadata.tsv", "respC ...", "predI ...", "orthoI ...", "testL ...", "typeC ...", "parAsColC ...", "parCexN ...", "parPc1I ...", "parPc2I ...", "parMahalC ...", "parLabVc ...", "algoC ...", "crossvalI ...", "log10L ...", "permI ...", "scaleC ...", "sampleMetadata_out mySampleMetadata_out.tsv", "variableMetadata_out myVariableMetadata_out.tsv", "figure figure.pdf", "information information.txt", "\n") quit(status = 0) } ######## # MAIN # ######## argVc <- unlist(parseCommandArgs(evaluate=FALSE)) ##------------------------------ ## Initializing ##------------------------------ ## options ##-------- strAsFacL <- options()$stringsAsFactors options(stringsAsFactors = FALSE) ## libraries ##---------- suppressMessages(library(ropls)) if(packageVersion("ropls") < "1.4.0") stop("Please use 'ropls' versions of 1.4.0 and above") ## constants ##---------- modNamC <- "Multivariate" ## module name topEnvC <- environment() flgC <- "\n" ## functions ##---------- flgF <- function(tesC, envC = topEnvC, txtC = NA) { ## management of warning and error messages tesL <- eval(parse(text = tesC), envir = envC) if(!tesL) { sink() stpTxtC <- ifelse(is.na(txtC), paste0(tesC, " is FALSE"), txtC) stop(stpTxtC, call. = FALSE) } } ## flgF ## log file ##--------- sink(argVc["information"]) cat("\nStart of the '", modNamC, "' Galaxy module call: ", format(Sys.time(), "%a %d %b %Y %X"), "\n", sep="") ## arguments ##---------- xMN <- t(as.matrix(read.table(argVc["dataMatrix_in"], check.names = FALSE, header = TRUE, row.names = 1, sep = "\t", comment.char = ""))) samDF <- read.table(argVc["sampleMetadata_in"], check.names = FALSE, header = TRUE, row.names = 1, sep = "\t", comment.char = "") flgF("identical(rownames(xMN), rownames(samDF))", txtC = "Sample names (or number) in the data matrix (first row) and sample metadata (first column) are not identical; use the 'Check Format' module in the 'Quality Control' section") varDF <- read.table(argVc["variableMetadata_in"], check.names = FALSE, header = TRUE, row.names = 1, sep = "\t", comment.char = "") flgF("identical(colnames(xMN), rownames(varDF))", txtC = "Variable names (or number) in the data matrix (first column) and sample metadata (first column) are not identical; use the 'Check Format' module in the 'Quality Control' section") flgF("argVc['respC'] == 'none' || (argVc['respC'] %in% colnames(samDF))", txtC = paste0("Y Response argument (", argVc['respC'], ") must be either none or one of the column names (first row) of your sample metadata")) if(argVc["respC"] != "none") { yMCN <- matrix(samDF[, argVc['respC']], ncol = 1, dimnames = list(rownames(xMN), argVc['respC'])) } else yMCN <- NULL if(argVc["testL"] == "TRUE") { flgF("!is.null(yMCN)", txtC = "Predictions cannot be peformed with PCA models") flgF("'test.' %in% colnames(samDF)", txtC = "No 'test.' column found in the sample metadata") flgF("identical(sort(unique(samDF[, 'test.'])), c('no', 'yes'))", txtC = "'test.' column of sample metadata must contain both 'yes' (tested samples) and 'no' (samples to be used for model training) values, and nothing else") flgF("identical(sort(unique(samDF[, 'test.'])), c('no', 'yes'))", txtC = "'test.' column of sample metadata must contain both 'yes' (tested samples) and 'no' (samples to be used for model training) values, and nothing else") flgF("!any(is.na(yMCN[samDF[, 'test.'] == 'no', ]))", txtC = "samples for model training (i.e. 'no' value in the 'test.' column) should not contain NA in the response") tesVl <- samDF[, "test."] == "yes" xTesMN <- xMN[tesVl, , drop = FALSE] xMN <- xMN[!tesVl, , drop = FALSE] yMCN <- yMCN[!tesVl, , drop = FALSE] } else tesVl <- NULL if(!('parAsColC' %in% names(argVc))) argVc["parAsColC"] <- "none" flgF("argVc['parAsColC'] == 'none' || argVc['parAsColC'] %in% colnames(samDF)", txtC = paste0("Sample color argument (", argVc['parAsColC'], ") must be either none or one of the column names (first row) of your sample metadata")) if(argVc["parAsColC"] != "none") { parAsColFcVn <- samDF[, argVc['parAsColC']] if(is.character(parAsColFcVn)) parAsColFcVn <- factor(parAsColFcVn) } else parAsColFcVn <- NA if(!('parMahalC' %in% names(argVc)) || argVc["parMahalC"] == "NA") { if(!is.null(yMCN) && ncol(yMCN) == 1 && mode(yMCN) == "character") argVc["parMahalC"] <- argVc["respC"] else argVc["parMahalC"] <- "none" } flgF("argVc['parMahalC'] == 'none' || (argVc['parMahalC'] %in% colnames(samDF))", txtC = paste0("Mahalanobis argument (", argVc['parMahalC'], ") must be either 'NA', 'none' or one of the column names (first row) of your sample metadata")) if(argVc["parMahalC"] == "none") { parEllipsesL <- FALSE } else { if(is.null(yMCN)) { ## PCA case flgF("mode(samDF[, argVc['parMahalC']]) == 'character'", txtC = paste0("Mahalanobis argument (", argVc['parMahalC'], ") must correspond to a column of characters in your sampleMetadata")) parAsColFcVn <- factor(samDF[, argVc["parMahalC"]]) parEllipsesL <- TRUE } else { ## (O)PLS-DA case flgF("identical(as.character(argVc['respC']), as.character(argVc['parMahalC']))", txtC = paste0("The Mahalanobis argument (", argVc['parMahalC'], ") must be identical to the Y response argument (", argVc['respC'], ")")) parEllipsesL <- TRUE } } if(!('parLabVc' %in% names(argVc))) argVc["parLabVc"] <- "none" flgF("argVc['parLabVc'] == 'none' || (argVc['parLabVc'] %in% colnames(samDF))", txtC = paste0("Sample labels argument (", argVc['parLabVc'], ") must be either none or one of the column names (first row) of your sample metadata")) if('parLabVc' %in% names(argVc)) if(argVc["parLabVc"] != "none") { flgF("mode(samDF[, argVc['parLabVc']]) == 'character'", txtC = paste0("The sample label argument (", argVc['parLabVc'], ") must correspond to a sample metadata column of characters (not numerics)")) parLabVc <- samDF[, argVc['parLabVc']] } else parLabVc <- NA if('parPc1I' %in% names(argVc)) { parCompVi <- as.numeric(c(argVc["parPc1I"], argVc["parPc2I"])) } else parCompVi <- c(1, 2) ## checking ##--------- flgF("argVc['predI'] == 'NA' || argVc['orthoI'] == 'NA' || as.numeric(argVc['orthoI']) > 0 || parCompVi[2] <= as.numeric(argVc['predI'])", txtC = paste0("The highest component to display (", parCompVi[2], ") must not exceed the number of predictive components of the model (", argVc['predI'], ")")) if(argVc["orthoI"] == "NA" || argVc["orthoI"] != "0") if(argVc["predI"] == "NA" || argVc["predI"] != "0") { argVc["predI"] <- "1" cat("\nWarning: OPLS: number of predictive components ('predI' argument) set to 1\n", sep = "") } if(argVc["predI"] != "NA") if(as.numeric(argVc["predI"]) > min(nrow(xMN), ncol(xMN))) { argVc["predI"] <- as.character(min(nrow(xMN), ncol(xMN))) cat("\nWarning: 'predI' set to the minimum of the dataMatrix dimensions: ", as.numeric(argVc["predI"]), "\n", sep = "") } if("algoC" %in% names(argVc) && argVc["algoC"] == "svd" && length(which(is.na(c(xMN)))) > 0) { minN <- min(c(xMN[!is.na(xMN)])) / 2 cat("\nWarning: Missing values set to ", round(minN, 1), " (half minimum value) for 'svd' algorithm to be used\n", sep = "") } ##------------------------------ ## Computation and plot ##------------------------------ sink() optWrnN <- options()$warn options(warn = -1) ropLs <- opls(x = xMN, y = yMCN, predI = ifelse(argVc["predI"] == "NA", NA, as.numeric(argVc["predI"])), orthoI = ifelse(argVc["orthoI"] == "NA", NA, as.numeric(argVc["orthoI"])), algoC = ifelse('algoC' %in% names(argVc), argVc["algoC"], "default"), crossvalI = ifelse('crossvalI' %in% names(argVc), as.numeric(argVc["crossvalI"]), 7), log10L = ifelse('log10L' %in% names(argVc), as.logical(argVc["log10L"]), FALSE), permI = ifelse('permI' %in% names(argVc), as.numeric(argVc["permI"]), 20), scaleC = ifelse('scaleC' %in% names(argVc), argVc["scaleC"], "standard"), subset = NULL, printL = FALSE, plotL = FALSE, .sinkC = argVc['information']) modC <- ropLs@typeC sumDF <- getSummaryDF(ropLs) desMC <- ropLs@descriptionMC scoreMN <- getScoreMN(ropLs) loadingMN <- getLoadingMN(ropLs) vipVn <- coeMN <- orthoScoreMN <- orthoLoadingMN <- orthoVipVn <- NULL if(grepl("PLS", modC)) { vipVn <- getVipVn(ropLs) coeMN <- coef(ropLs) if(grepl("OPLS", modC)) { orthoScoreMN <- getScoreMN(ropLs, orthoL = TRUE) orthoLoadingMN <- getLoadingMN(ropLs, orthoL = TRUE) orthoVipVn <- getVipVn(ropLs, orthoL = TRUE) } } ploC <- ifelse('typeC' %in% names(argVc), argVc["typeC"], "summary") if(sumDF[, "pre"] + sumDF[, "ort"] < 2) { if(!(ploC %in% c("permutation", "overview"))) { ploC <- "summary" plotWarnL <- TRUE } } else plotWarnL <- FALSE plot(ropLs, typeVc = ploC, parAsColFcVn = parAsColFcVn, parCexN = ifelse('parCexN' %in% names(argVc), as.numeric(argVc["parCexN"]), 0.8), parCompVi = parCompVi, parEllipsesL = parEllipsesL, parLabVc = parLabVc, file.pdfC = argVc['figure'], .sinkC = argVc['information']) options(warn = optWrnN) ##------------------------------ ## Print ##------------------------------ sink(argVc["information"], append = TRUE) if(plotWarnL) cat("\nWarning: For single component models, only 'overview' (and 'permutation' in case of single response (O)PLS(-DA)) plot(s) are available\n", sep = "") cat("\n", modC, "\n", sep = "") cat("\n", desMC["samples", ], " samples x ", desMC["X_variables", ], " variables", ifelse(modC != "PCA", " and 1 response", ""), "\n", sep = "") cat("\n", ropLs@suppLs[["scaleC"]], " scaling of dataMatrix", ifelse(modC == "PCA", "", paste0(" and ", ifelse(mode(ropLs@suppLs[["yMCN"]]) == "character" && ropLs@suppLs[["scaleC"]] != "standard", "standard scaling of ", ""), "response\n")), sep = "") if(substr(desMC["missing_values", ], 1, 1) != "0") cat("\n", desMC["missing_values", ], " NAs\n", sep = "") if(substr(desMC["near_zero_excluded_X_variables", ], 1, 1) != "0") cat("\n", desMC["near_zero_excluded_X_variables", ], " excluded variables during model building (because of near zero variance)\n", sep = "") cat("\n") optDigN <- options()[["digits"]] options(digits = 3) print(ropLs@modelDF) options(digits = optDigN) ##------------------------------ ## Ending ##------------------------------ ## Saving ##------- rspModC <- gsub("-", "", modC) if(rspModC != "PCA") rspModC <- paste0(make.names(argVc['respC']), "_", rspModC) if(sumDF[, "pre"] + sumDF[, "ort"] < 2) { tCompMN <- scoreMN pCompMN <- loadingMN } else { if(sumDF[, "ort"] > 0) { if(parCompVi[2] > sumDF[, "ort"] + 1) stop("Selected orthogonal component for plotting (ordinate) exceeds the total number of orthogonal components of the model", call. = FALSE) tCompMN <- cbind(scoreMN[, 1], orthoScoreMN[, parCompVi[2] - 1]) pCompMN <- cbind(loadingMN[, 1], orthoLoadingMN[, parCompVi[2] - 1]) colnames(pCompMN) <- colnames(tCompMN) <- c("h1", paste("o", parCompVi[2] - 1, sep = "")) } else { if(max(parCompVi) > sumDF[, "pre"]) stop("Selected component for plotting as ordinate exceeds the total number of predictive components of the model", call. = FALSE) tCompMN <- scoreMN[, parCompVi, drop = FALSE] pCompMN <- loadingMN[, parCompVi, drop = FALSE] } } ## x-scores and prediction colnames(tCompMN) <- paste0(rspModC, "_XSCOR-", colnames(tCompMN)) tCompDF <- as.data.frame(tCompMN)[rownames(samDF), , drop = FALSE] if(modC != "PCA") { if(!is.null(tesVl)) { tCompFulMN <- matrix(NA, nrow = nrow(samDF), ncol = ncol(tCompMN), dimnames = list(rownames(samDF), colnames(tCompMN))) mode(tCompFulMN) <- "numeric" tCompFulMN[rownames(tCompMN), ] <- tCompMN tCompMN <- tCompFulMN fitMCN <- fitted(ropLs) fitFulMCN <- matrix(NA, nrow = nrow(samDF), ncol = 1, dimnames = list(rownames(samDF), NULL)) mode(fitFulMCN) <- mode(fitMCN) fitFulMCN[rownames(fitMCN), ] <- fitMCN yPreMCN <- predict(ropLs, newdata = as.data.frame(xTesMN)) fitFulMCN[rownames(yPreMCN), ] <- yPreMCN fitMCN <- fitFulMCN } else fitMCN <- fitted(ropLs) colnames(fitMCN) <- paste0(rspModC, "_predictions") fitDF <- as.data.frame(fitMCN)[rownames(samDF), , drop = FALSE] tCompDF <- cbind.data.frame(tCompDF, fitDF) } samDF <- cbind.data.frame(samDF, tCompDF) ## x-loadings and VIP colnames(pCompMN) <- paste0(rspModC, "_XLOAD-", colnames(pCompMN)) if(!is.null(vipVn)) { pCompMN <- cbind(pCompMN, vipVn) colnames(pCompMN)[ncol(pCompMN)] <- paste0(rspModC, "_VIP", ifelse(!is.null(orthoVipVn), "_pred", "")) if(!is.null(orthoVipVn)) { pCompMN <- cbind(pCompMN, orthoVipVn) colnames(pCompMN)[ncol(pCompMN)] <- paste0(rspModC, "_VIP_ortho") } } if(!is.null(coeMN)) { pCompMN <- cbind(pCompMN, coeMN) if(ncol(coeMN) == 1) colnames(pCompMN)[ncol(pCompMN)] <- paste0(rspModC, "_COEFF") else colnames(pCompMN)[(ncol(pCompMN) - ncol(coeMN) + 1):ncol(pCompMN)] <- paste0(rspModC, "_", colnames(coeMN), "-COEFF") } pCompDF <- as.data.frame(pCompMN)[rownames(varDF), , drop = FALSE] varDF <- cbind.data.frame(varDF, pCompDF) ## sampleMetadata samDF <- cbind.data.frame(sampleMetadata = rownames(samDF), samDF) write.table(samDF, file = argVc["sampleMetadata_out"], quote = FALSE, row.names = FALSE, sep = "\t") ## variableMetadata varDF <- cbind.data.frame(variableMetadata = rownames(varDF), varDF) write.table(varDF, file = argVc["variableMetadata_out"], quote = FALSE, row.names = FALSE, sep = "\t") # Output ropLs if (!is.null(argVc['ropls_out']) && !is.na(argVc['ropls_out'])) save(ropLs, file = argVc['ropls_out']) ## Closing ##-------- cat("\nEnd of '", modNamC, "' Galaxy module call: ", as.character(Sys.time()), "\n", sep = "") cat("\n\n\n============================================================================") cat("\nAdditional information about the call:\n") cat("\n1) Parameters:\n") print(cbind(value = argVc)) cat("\n2) Session Info:\n") sessioninfo <- sessionInfo() cat(sessioninfo$R.version$version.string,"\n") cat("Main packages:\n") for (pkg in names(sessioninfo$otherPkgs)) { cat(paste(pkg,packageVersion(pkg)),"\t") }; cat("\n") cat("Other loaded packages:\n") for (pkg in names(sessioninfo$loadedOnly)) { cat(paste(pkg,packageVersion(pkg)),"\t") }; cat("\n") cat("============================================================================\n") sink() options(stringsAsFactors = strAsFacL) rm(list = ls())