diff multivariate_wrapper.R @ 0:fafba524dca6 draft

planemo upload for repository https://github.com/workflow4metabolomics/multivariate.git commit 6596dbd39d20ee1962d9ebdd87eec04821239760
author ethevenot
date Wed, 27 Jul 2016 11:22:56 -0400
parents
children da272496b32d
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/multivariate_wrapper.R	Wed Jul 27 11:22:56 2016 -0400
@@ -0,0 +1,448 @@
+#!/usr/bin/env Rscript
+
+library(batch) ## parseCommandArgs
+
+########
+# MAIN #
+########
+
+argVc <- unlist(parseCommandArgs(evaluate=FALSE))
+
+
+#### Start_of_tested_code  <- function() {}
+
+
+##------------------------------
+## Initializing
+##------------------------------
+
+## options
+##--------
+
+strAsFacL <- options()$stringsAsFactors
+options(stringsAsFactors = FALSE)
+
+## libraries
+##----------
+
+suppressMessages(library(ropls))
+
+if(packageVersion("ropls") < "1.4.0")
+    cat("\nWarning: new version of the 'ropls' package is available\n", sep="")
+
+## constants
+##----------
+
+modNamC <- "Multivariate" ## module name
+
+topEnvC <- environment()
+flgC <- "\n"
+
+## functions
+##----------
+
+flgF <- function(tesC,
+                 envC = topEnvC,
+                 txtC = NA) { ## management of warning and error messages
+
+    tesL <- eval(parse(text = tesC), envir = envC)
+
+    if(!tesL) {
+
+        sink(NULL)
+        stpTxtC <- ifelse(is.na(txtC),
+                          paste0(tesC, " is FALSE"),
+                          txtC)
+
+        stop(stpTxtC,
+             call. = FALSE)
+
+    }
+
+} ## flgF
+
+
+## log file
+##---------
+
+sink(argVc["information"])
+
+cat("\nStart of the '", modNamC, "' Galaxy module call: ",
+    format(Sys.time(), "%a %d %b %Y %X"), "\n", sep="")
+
+
+## arguments
+##----------
+
+xMN <- t(as.matrix(read.table(argVc["dataMatrix_in"],
+                              check.names = FALSE,
+                              header = TRUE,
+                              row.names = 1,
+                              sep = "\t")))
+
+samDF <- read.table(argVc["sampleMetadata_in"],
+                    check.names = FALSE,
+                    header = TRUE,
+                    row.names = 1,
+                    sep = "\t")
+flgF("identical(rownames(xMN), rownames(samDF))", txtC = "Sample names (or number) in the data matrix (first row) and sample metadata (first column) are not identical; use the 'Check Format' module in the 'Quality Control' section")
+
+varDF <- read.table(argVc["variableMetadata_in"],
+                    check.names = FALSE,
+                    header = TRUE,
+                    row.names = 1,
+                    sep = "\t")
+flgF("identical(colnames(xMN), rownames(varDF))", txtC = "Variable names (or number) in the data matrix (first column) and sample metadata (first column) are not identical; use the 'Check Format' module in the 'Quality Control' section")
+
+flgF("argVc['respC'] == 'none' || (argVc['respC'] %in% colnames(samDF))",
+     txtC = paste0("Y Response argument (", argVc['respC'], ") must be either none or one of the column names (first row) of your sample metadata"))
+if(argVc["respC"] != "none") {
+    yMCN <- matrix(samDF[, argVc['respC']], ncol = 1, dimnames = list(rownames(xMN), argVc['respC']))
+} else
+    yMCN <- NULL
+
+if(argVc["testL"] == "TRUE") {
+    flgF("!is.null(yMCN)",
+         txtC = "Predictions cannot be peformed with PCA models")
+    flgF("'test.' %in% colnames(samDF)",
+         txtC = "No 'test.' column found in the sample metadata")
+    flgF("identical(sort(unique(samDF[, 'test.'])), c('no', 'yes'))",
+         txtC = "'test.' column of sample metadata must contain both 'yes' (tested samples) and 'no' (samples to be used for model training) values, and nothing else")
+    flgF("identical(sort(unique(samDF[, 'test.'])), c('no', 'yes'))",
+         txtC = "'test.' column of sample metadata must contain both 'yes' (tested samples) and 'no' (samples to be used for model training) values, and nothing else")
+    flgF("!any(is.na(yMCN[samDF[, 'test.'] == 'no', ]))",
+         txtC = "samples for model training (i.e. 'no' value in the 'test.' column) should not contain NA in the response")
+    tesVl <- samDF[, "test."] == "yes"
+    xTesMN <- xMN[tesVl, , drop = FALSE]
+    xMN <- xMN[!tesVl, , drop = FALSE]
+    yMCN <- yMCN[!tesVl, , drop = FALSE]
+} else
+    tesVl <- NULL
+
+if(!('parAsColC' %in% names(argVc)))
+    argVc["parAsColC"] <- "none"
+flgF("argVc['parAsColC'] == 'none' || argVc['parAsColC'] %in% colnames(samDF)", txtC = paste0("Sample color argument (", argVc['parAsColC'], ") must be either none or one of the column names (first row) of your sample metadata"))
+if(argVc["parAsColC"] != "none") {
+    parAsColFcVn <- samDF[, argVc['parAsColC']]
+    if(is.character(parAsColFcVn))
+        parAsColFcVn <- factor(parAsColFcVn)
+} else
+    parAsColFcVn <- NA
+
+if(!('parMahalC' %in% names(argVc)) || argVc["parMahalC"] == "NA") {
+    if(!is.null(yMCN) && ncol(yMCN) == 1 && mode(yMCN) == "character")
+        argVc["parMahalC"] <- argVc["respC"]
+    else
+        argVc["parMahalC"] <- "none"
+}
+flgF("argVc['parMahalC'] == 'none' || (argVc['parMahalC'] %in% colnames(samDF))",
+     txtC = paste0("Mahalanobis argument (", argVc['parMahalC'], ") must be either 'NA', 'none' or one of the column names (first row) of your sample metadata"))
+if(argVc["parMahalC"] == "none") {
+    parEllipsesL <- FALSE
+} else {
+    if(is.null(yMCN)) { ## PCA case
+        flgF("mode(samDF[, argVc['parMahalC']]) == 'character'",
+             txtC = paste0("Mahalanobis argument (", argVc['parMahalC'], ") must correspond to a column of characters in your sampleMetadata"))
+        parAsColFcVn <- factor(samDF[, argVc["parMahalC"]])
+        parEllipsesL <- TRUE
+    } else { ## (O)PLS-DA case
+        flgF("identical(as.character(argVc['respC']), as.character(argVc['parMahalC']))",
+             txtC = paste0("The Mahalanobis argument (", argVc['parMahalC'], ") must be identical to the Y response argument (", argVc['respC'], ")"))
+        parEllipsesL <- TRUE
+    }
+}
+
+if(!('parLabVc' %in% names(argVc)))
+    argVc["parLabVc"] <- "none"
+flgF("argVc['parLabVc'] == 'none' || (argVc['parLabVc'] %in% colnames(samDF))",
+     txtC = paste0("Sample labels argument (", argVc['parLabVc'], ") must be either none or one of the column names (first row) of your sample metadata"))
+if('parLabVc' %in% names(argVc))
+    if(argVc["parLabVc"] != "none") {
+        flgF("mode(samDF[, argVc['parLabVc']]) == 'character'",
+             txtC = paste0("The sample label argument (", argVc['parLabVc'], ") must correspond to a sample metadata column of characters (not numerics)"))
+        parLabVc <- samDF[, argVc['parLabVc']]
+    } else
+        parLabVc <- NA
+
+if('parPc1I' %in% names(argVc)) {
+    parCompVi <-  as.numeric(c(argVc["parPc1I"], argVc["parPc2I"]))
+} else
+    parCompVi <- c(1, 2)
+
+
+## checking
+##---------
+
+
+flgF("argVc['predI'] == 'NA' || argVc['orthoI'] == 'NA' || as.numeric(argVc['orthoI']) > 0 || parCompVi[2] <=  as.numeric(argVc['predI'])",
+     txtC = paste0("The highest component to display (", parCompVi[2], ") must not exceed the number of predictive components of the model (", argVc['predI'], ")"))
+
+
+if(argVc["orthoI"] == "NA" || argVc["orthoI"] != "0")
+    if(argVc["predI"] == "NA" || argVc["predI"] != "0") {
+        argVc["predI"] <- "1"
+        cat("\nWarning: OPLS: number of predictive components ('predI' argument) set to 1\n", sep = "")
+    }
+
+if(argVc["predI"] != "NA")
+    if(as.numeric(argVc["predI"]) > min(nrow(xMN), ncol(xMN))) {
+        argVc["predI"] <- as.character(min(nrow(xMN), ncol(xMN)))
+        cat("\nWarning: 'predI' set to the minimum of the dataMatrix dimensions: ", as.numeric(argVc["predI"]), "\n", sep = "")
+    }
+
+if("algoC" %in% names(argVc) && argVc["algoC"] == "svd" && length(which(is.na(c(xMN)))) > 0) {
+    minN <- min(c(xMN[!is.na(xMN)])) / 2
+    cat("\nWarning: Missing values set to ", round(minN, 1), " (half minimum value) for 'svd' algorithm to be used\n", sep = "")
+}
+
+
+##------------------------------
+## Computation and plot
+##------------------------------
+
+
+sink()
+
+optWrnN <- options()$warn
+options(warn = -1)
+
+ropLs <- opls(x = xMN,
+              y = yMCN,
+              predI = ifelse(argVc["predI"] == "NA", NA, as.numeric(argVc["predI"])),
+              orthoI = ifelse(argVc["orthoI"] == "NA", NA, as.numeric(argVc["orthoI"])),
+              algoC = ifelse('algoC' %in% names(argVc), argVc["algoC"], "default"),
+              crossvalI = ifelse('crossvalI' %in% names(argVc), as.numeric(argVc["crossvalI"]), 7),
+              log10L = ifelse('log10L' %in% names(argVc), as.logical(argVc["log10L"]), FALSE),
+              permI = ifelse('permI' %in% names(argVc), as.numeric(argVc["permI"]), 20),
+              scaleC = ifelse('scaleC' %in% names(argVc), argVc["scaleC"], "standard"),
+              subset = NULL,
+              printL = FALSE,
+              plotL = FALSE,
+              .sinkC = argVc['information'])
+
+modC <- ropLs@typeC
+sumDF <- getSummaryDF(ropLs)
+desMC <- ropLs@descriptionMC
+scoreMN <- getScoreMN(ropLs)
+loadingMN <- getLoadingMN(ropLs)
+
+vipVn <- coeMN <- orthoScoreMN <- orthoLoadingMN <- orthoVipVn <- NULL
+
+if(grepl("PLS", modC)) {
+
+    vipVn <- getVipVn(ropLs)
+    coeMN <- coef(ropLs)
+
+    if(grepl("OPLS", modC)) {
+        orthoScoreMN <- getScoreMN(ropLs, orthoL = TRUE)
+        orthoLoadingMN <- getLoadingMN(ropLs, orthoL = TRUE)
+        orthoVipVn <- getVipVn(ropLs, orthoL = TRUE)
+    }
+
+}
+
+ploC <- ifelse('typeC' %in% names(argVc), argVc["typeC"], "summary")
+
+if(sumDF[, "pre"] + sumDF[, "ort"] < 2) {
+    if(!(ploC %in% c("permutation", "overview"))) {
+        ploC <- "summary"
+        plotWarnL <- TRUE
+    }
+} else
+    plotWarnL <- FALSE
+
+plot(ropLs,
+     typeVc = ploC,
+     parAsColFcVn = parAsColFcVn,
+     parCexN = ifelse('parCexN' %in% names(argVc), as.numeric(argVc["parCexN"]), 0.8),
+     parCompVi = parCompVi,
+     parEllipsesL = parEllipsesL,
+     parLabVc = parLabVc,
+     file.pdfC = argVc['figure'],
+     .sinkC = argVc['information'])
+
+options(warn = optWrnN)
+
+
+##------------------------------
+## Print
+##------------------------------
+
+
+sink(argVc["information"], append = TRUE)
+
+if(plotWarnL)
+    cat("\nWarning: For single component models, only 'overview' (and 'permutation' in case of single response (O)PLS(-DA)) plot(s) are available\n", sep = "")
+
+
+cat("\n", modC, "\n", sep = "")
+
+cat("\n", desMC["samples", ],
+    " samples x ",
+    desMC["X_variables", ],
+    " variables",
+    ifelse(modC != "PCA",
+           " and 1 response",
+           ""),
+    "\n", sep = "")
+
+cat("\n", ropLs@suppLs[["scaleC"]], " scaling of dataMatrix",
+            ifelse(modC == "PCA",
+                   "",
+                   paste0(" and ",
+                          ifelse(mode(ropLs@suppLs[["yMCN"]]) == "character" && ropLs@suppLs[["scaleC"]] != "standard",
+                                 "standard scaling of ",
+                                 ""),
+                          "response\n")), sep = "")
+
+if(substr(desMC["missing_values", ], 1, 1) != "0")
+    cat("\n", desMC["missing_values", ], " NAs\n", sep = "")
+
+if(substr(desMC["near_zero_excluded_X_variables", ], 1, 1) != "0")
+    cat("\n", desMC["near_zero_excluded_X_variables", ],
+        " excluded variables during model building (because of near zero variance)\n", sep = "")
+
+cat("\n")
+
+optDigN <- options()[["digits"]]
+options(digits = 3)
+print(ropLs@modelDF)
+options(digits = optDigN)
+
+
+##------------------------------
+## Ending
+##------------------------------
+
+
+## Saving
+##-------
+
+
+rspModC <- gsub("-", "", modC)
+if(rspModC != "PCA")
+    rspModC <- paste0(make.names(argVc['respC']), "_", rspModC)
+
+if(sumDF[, "pre"] + sumDF[, "ort"] < 2) {
+
+    tCompMN <- scoreMN
+    pCompMN <- loadingMN
+
+} else {
+
+    if(sumDF[, "ort"] > 0) {
+        if(parCompVi[2] > sumDF[, "ort"] + 1)
+            stop("Selected orthogonal component for plotting (ordinate) exceeds the total number of orthogonal components of the model", call. = FALSE)
+        tCompMN <- cbind(scoreMN[, 1], orthoScoreMN[, parCompVi[2] - 1])
+        pCompMN <- cbind(loadingMN[, 1], orthoLoadingMN[, parCompVi[2] - 1])
+        colnames(pCompMN) <- colnames(tCompMN) <- c("h1", paste("o", parCompVi[2] - 1, sep = ""))
+    } else {
+        if(max(parCompVi) > sumDF[, "pre"])
+            stop("Selected component for plotting as ordinate exceeds the total number of predictive components of the model", call. = FALSE)
+        tCompMN <- scoreMN[, parCompVi, drop = FALSE]
+        pCompMN <- loadingMN[, parCompVi, drop = FALSE]
+    }
+
+}
+
+## x-scores and prediction
+
+colnames(tCompMN) <- paste0(rspModC, "_XSCOR-", colnames(tCompMN))
+tCompDF <- as.data.frame(tCompMN)[rownames(samDF), , drop = FALSE]
+
+if(modC != "PCA") {
+
+    if(!is.null(tesVl)) {
+        tCompFulMN <- matrix(NA,
+                             nrow = nrow(samDF),
+                             ncol = ncol(tCompMN),
+                             dimnames = list(rownames(samDF), colnames(tCompMN)))
+        mode(tCompFulMN) <- "numeric"
+        tCompFulMN[rownames(tCompMN), ] <- tCompMN
+        tCompMN <- tCompFulMN
+
+        fitMCN <- fitted(ropLs)
+        fitFulMCN <- matrix(NA,
+                            nrow = nrow(samDF),
+                            ncol = 1,
+                            dimnames = list(rownames(samDF), NULL))
+        mode(fitFulMCN) <- mode(fitMCN)
+        fitFulMCN[rownames(fitMCN), ] <- fitMCN
+        yPreMCN <- predict(ropLs, newdata = as.data.frame(xTesMN))
+        fitFulMCN[rownames(yPreMCN), ] <- yPreMCN
+        fitMCN <- fitFulMCN
+
+    } else
+        fitMCN <- fitted(ropLs)
+
+    colnames(fitMCN) <- paste0(rspModC,
+                               "_predictions")
+    fitDF <- as.data.frame(fitMCN)[rownames(samDF), , drop = FALSE]
+
+    tCompDF <- cbind.data.frame(tCompDF, fitDF)
+}
+
+samDF <- cbind.data.frame(samDF, tCompDF)
+
+## x-loadings and VIP
+
+colnames(pCompMN) <- paste0(rspModC, "_XLOAD-", colnames(pCompMN))
+if(!is.null(vipVn)) {
+    pCompMN <- cbind(pCompMN, vipVn)
+    colnames(pCompMN)[ncol(pCompMN)] <- paste0(rspModC,
+                                               "_VIP",
+                                               ifelse(!is.null(orthoVipVn),
+                                                      "_pred",
+                                                      ""))
+    if(!is.null(orthoVipVn)) {
+        pCompMN <- cbind(pCompMN, orthoVipVn)
+        colnames(pCompMN)[ncol(pCompMN)] <- paste0(rspModC,
+                                                   "_VIP_ortho")
+    }
+}
+if(!is.null(coeMN)) {
+    pCompMN <- cbind(pCompMN, coeMN)
+    colnames(pCompMN)[ncol(pCompMN)] <- paste0(rspModC, "_COEFF")
+}
+pCompDF <- as.data.frame(pCompMN)[rownames(varDF), , drop = FALSE]
+varDF <- cbind.data.frame(varDF, pCompDF)
+
+## sampleMetadata
+
+samDF <- cbind.data.frame(sampleMetadata = rownames(samDF),
+                          samDF)
+write.table(samDF,
+            file = argVc["sampleMetadata_out"],
+            quote = FALSE,
+            row.names = FALSE,
+            sep = "\t")
+
+## variableMetadata
+
+varDF <- cbind.data.frame(variableMetadata = rownames(varDF),
+                          varDF)
+write.table(varDF,
+            file = argVc["variableMetadata_out"],
+            quote = FALSE,
+            row.names = FALSE,
+            sep = "\t")
+
+# Output ropLs
+if ( ! is.null(argVc['ropls_out']))
+	save(ropLs, file = argVc['ropls_out'])
+
+## Closing
+##--------
+
+cat("\nEnd of '", modNamC, "' Galaxy module call: ",
+    as.character(Sys.time()), "\n", sep = "")
+
+sink()
+
+options(stringsAsFactors = strAsFacL)
+
+
+#### End_of_tested_code <- function() {}
+
+
+rm(list = ls())