diff checkformat_script.R @ 1:e194eec8e70c draft

planemo upload for repository https://github.com/workflow4metabolomics/checkformat.git commit 8ebfbfa8d9449c9bbfbf569851a30b1e33df0b3f
author ethevenot
date Sat, 06 Aug 2016 11:54:28 -0400
parents
children 80a38d36f946
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/checkformat_script.R	Sat Aug 06 11:54:28 2016 -0400
@@ -0,0 +1,186 @@
+## Etienne Thevenot
+## CEA, MetaboHUB Paris
+## etienne.thevenot@cea.fr
+
+
+## Reads the dataMatrix, sampleMetadata, and variableMetadata .tsv files
+## and checks the formats
+##
+## Arguments (paths to tab-separated files with a header line, the first
+## column of each file being read as the row names):
+##   datFilC: dataMatrix (variables as rows, samples as columns)
+##   samFilC: sampleMetadata (samples as rows)
+##   varFilC: variableMetadata (variables as rows)
+##
+## Value: list with the logical 'chkL', which is TRUE only when the sample
+## names and the variable names are identical (same order) across the tables
+##
+## Stops when a table has duplicated row or column names; the other
+## diagnostics are printed with cat/print and do not interrupt the function
+readAndCheckF <- function(datFilC="dataMatrix.tsv",
+                          samFilC="sampleMetadata.tsv",
+                          varFilC="variableMetadata.tsv") {
+
+    ## options (restored on exit, including when an error is raised)
+
+    optLs <- options(stringsAsFactors = FALSE)
+    on.exit(options(optLs), add = TRUE)
+
+    ## checking that the tables have no duplicated row or column names
+
+    for(tabC in c("dat", "sam", "var")) {
+
+        tabNamC <- switch(tabC, dat="dataMatrix", sam="sampleMetadata", var="variableMetadata")
+        filC <- switch(tabC, dat=datFilC, sam=samFilC, var=varFilC)
+
+        rowVc <- read.table(filC, check.names = FALSE, header = TRUE, sep = "\t")[, 1]
+
+        ## the header line is read as data so that the names are kept unchanged
+        colVc <- unlist(read.table(filC, check.names = FALSE, nrows = 1, sep = "\t"))[-1]
+
+        if(any(duplicated(rowVc)))
+            stop("The following row name(s) is/are duplicated in the ",
+                 tabNamC,
+                 " table: '",
+                 paste(rowVc[duplicated(rowVc)], collapse="', '"), "'",
+                 call.=FALSE)
+
+        if(any(duplicated(colVc)))
+            stop("The following column name(s) is/are duplicated in the ",
+                 tabNamC,
+                 " table: '",
+                 paste(colVc[duplicated(colVc)], collapse="', '"), "'",
+                 call.=FALSE)
+
+        rowMakVc <- make.names(rowVc, unique = TRUE)
+
+        ## missing names are flagged too (an NA comparison would break 'if')
+        rowDifVl <- is.na(rowVc) | rowVc != rowMakVc
+
+        if(any(rowDifVl)) {
+            rowDifDF <- data.frame(row = seq_along(rowVc),
+                                   actual = rowVc,
+                                   preferred = rowMakVc)
+            rowDifDF <- rowDifDF[rowDifVl, , drop = FALSE]
+            cat("\n\nWarning: The following row names of the ",
+                tabNamC,
+                " table are not in the standard R format, which may result in errors when loading the data in some of the W4M modules:\n", sep="")
+            print(rowDifDF)
+        }
+
+        colMakVc <- make.names(colVc, unique = TRUE)
+
+        colDifVl <- is.na(colVc) | colVc != colMakVc
+
+        if(any(colDifVl)) {
+            colDifDF <- data.frame(col = seq_along(colVc),
+                                   actual = colVc,
+                                   preferred = colMakVc)
+            colDifDF <- colDifDF[colDifVl, , drop = FALSE]
+            cat("\n\nWarning: The following column names of the ",
+                tabNamC,
+                " table are not in the standard R format, which may result in errors when loading the data in some of the W4M modules:\n", sep="")
+            print(colDifDF)
+        }
+    }
+
+    ## reading tables
+
+    ## the dataMatrix is transposed: samples as rows, variables as columns
+    datMN <- t(as.matrix(read.table(datFilC, check.names = FALSE, header = TRUE,
+                                    row.names = 1, sep = "\t")))
+
+    samDF <- read.table(samFilC, check.names = FALSE, header = TRUE,
+                        row.names = 1, sep = "\t")
+
+    varDF <- read.table(varFilC, check.names = FALSE, header = TRUE,
+                        row.names = 1, sep = "\t")
+
+    ## checking formats
+
+    chkL <- TRUE
+
+    if(!identical(rownames(datMN), rownames(samDF))) {
+        ## checking sample names
+
+        chkL <- FALSE
+
+        datSamDifVc <- setdiff(rownames(datMN), rownames(samDF))
+
+        if(length(datSamDifVc)) {
+            cat("\nThe following samples were found in the dataMatrix column names but not in the sampleMetadata row names:\n", sep="")
+            print(cbind.data.frame(col = match(datSamDifVc, rownames(datMN)),
+                                   name = datSamDifVc))
+        }
+
+        samDatDifVc <- setdiff(rownames(samDF), rownames(datMN))
+
+        if(length(samDatDifVc)) {
+            cat("\n\nThe following samples were found in the sampleMetadata row names but not in the dataMatrix column names:\n", sep="")
+            print(cbind.data.frame(row = match(samDatDifVc, rownames(samDF)),
+                                   name = samDatDifVc))
+        }
+
+        if(nrow(datMN) != nrow(samDF)) {
+            cat("\n\nThe dataMatrix has ", nrow(datMN), " columns (ie samples) whereas the sampleMetadata has ", nrow(samDF), " rows\n", sep="")
+        } else if(identical(gsub("^X", "", rownames(datMN)), rownames(samDF))) {
+            cat("\n\nThe dataMatrix column names start with an 'X' but not the sampleMetadata row names\n", sep="")
+        } else if(identical(gsub("^X", "", rownames(samDF)), rownames(datMN))) {
+            cat("\n\nThe sampleMetadata row names start with an 'X' but not the dataMatrix column names\n", sep="")
+        } else if(identical(sort(rownames(datMN)), sort(rownames(samDF)))) {
+            cat("\n\nThe dataMatrix column names and the sampleMetadata row names are not in the same order:\n", sep="")
+            print(cbind.data.frame(indice = seq_len(nrow(datMN)),
+                                   dataMatrix_columnnames=rownames(datMN),
+                                   sampleMetadata_rownames=rownames(samDF))[rownames(datMN) != rownames(samDF), , drop = FALSE])
+        } else {
+            cat("\n\nThe dataMatrix column names and the sampleMetadata row names are not identical:\n", sep="")
+            print(cbind.data.frame(indice = seq_len(nrow(datMN)),
+                                   dataMatrix_columnnames=rownames(datMN),
+                                   sampleMetadata_rownames=rownames(samDF))[rownames(datMN) != rownames(samDF), , drop = FALSE])
+        }
+
+    }
+
+    if(!identical(colnames(datMN), rownames(varDF))) {
+        ## checking variable names
+
+        chkL <- FALSE
+
+        datVarDifVc <- setdiff(colnames(datMN), rownames(varDF))
+
+        if(length(datVarDifVc)) {
+            cat("\nThe following variables were found in the dataMatrix row names but not in the variableMetadata row names:\n", sep="")
+            print(cbind.data.frame(row = match(datVarDifVc, colnames(datMN)),
+                        name = datVarDifVc))
+
+        }
+
+        varDatDifVc <- setdiff(rownames(varDF), colnames(datMN))
+
+        if(length(varDatDifVc)) {
+            cat("\n\nThe following variables were found in the variableMetadata row names but not in the dataMatrix row names:\n", sep="")
+            print(cbind.data.frame(row = match(varDatDifVc, rownames(varDF)),
+                        name = varDatDifVc))
+        }
+
+        ## datMN is transposed: its columns (not its rows) are the variables
+        if(ncol(datMN) != nrow(varDF)) {
+            cat("\n\nThe dataMatrix has ", ncol(datMN), " rows (ie variables) whereas the variableMetadata has ", nrow(varDF), " rows\n", sep="")
+        } else if(identical(sort(colnames(datMN)), sort(rownames(varDF)))) {
+            cat("\n\nThe dataMatrix row names and the variableMetadata row names are not in the same order:\n", sep="")
+            print(cbind.data.frame(row = seq_len(ncol(datMN)),
+                                   dataMatrix_rownames=colnames(datMN),
+                                   variableMetadata_rownames=rownames(varDF))[colnames(datMN) != rownames(varDF), , drop = FALSE])
+        } else {
+            cat("\n\nThe dataMatrix row names and the variableMetadata row names are not identical:\n", sep="")
+            print(cbind.data.frame(row = seq_len(ncol(datMN)),
+                                   dataMatrix_rownames=colnames(datMN),
+                                   variableMetadata_rownames=rownames(varDF))[colnames(datMN) != rownames(varDF), , drop = FALSE])
+        }
+    }
+
+    resLs <- list(chkL=chkL)
+
+    return(resLs)
+
+} ## end of readAndCheckF