diff checkformat_script.R @ 3:80a38d36f946 draft

planemo upload for repository https://github.com/workflow4metabolomics/checkformat.git commit 5cf3f6eb62c1396ade1b068a3dd3cc2e3f827e15
author ethevenot
date Thu, 11 Jan 2018 10:24:56 -0500
parents e194eec8e70c
children 9590fac86f63
line wrap: on
line diff
--- a/checkformat_script.R	Tue Jun 06 11:51:33 2017 -0400
+++ b/checkformat_script.R	Thu Jan 11 10:24:56 2018 -0500
@@ -3,16 +3,20 @@
 ## etienne.thevenot@cea.fr
 
 
+
 ## Reads the dataMatrix, sampleMetadata, and variableMetadata .tsv files
 ## and checks the formats
 readAndCheckF <- function(datFilC="dataMatrix.tsv",
                           samFilC="sampleMetadata.tsv",
-                          varFilC="variableMetadata.tsv") {
+                          varFilC="variableMetadata.tsv",
+                          makNamL) {
+    
 
     ## options
 
     optStrAsFacL <- options()[["stringsAsFactors"]]
     options(stringsAsFactors = FALSE)
+    
 
     ## checking that the tables have no duplicated row or column names
 
@@ -44,36 +48,8 @@
                  paste(colVc[duplicated(colVc)], collapse="', '"), "'",
                  call.=FALSE)
 
-        rowMakVc <- make.names(rowVc, unique = TRUE)
-
-        rowDifVl <- rowVc != rowMakVc
-
-        if(any(rowDifVl)) {
-            rowDifDF <- data.frame(row = 1:length(rowVc),
-                                   actual = rowVc,
-                                   preferred = rowMakVc)
-            rowDifDF <- rowDifDF[rowDifVl, , drop = FALSE]
-            cat("\n\nWarning: The following row names of the ",
-                tabNamC,
-                " table are not in the standard R format, which may result in errors when loading the data in some of the W4M modules:\n", sep="")
-            print(rowDifDF)
-        }
-
-        colMakVc <- make.names(colVc, unique = TRUE)
-
-        colDifVl <- colVc != colMakVc
-
-        if(any(colDifVl)) {
-            colDifDF <- data.frame(col = 1:length(colVc),
-                                   actual = colVc,
-                                   preferred = colMakVc)
-            colDifDF <- colDifDF[colDifVl, , drop = FALSE]
-            cat("\n\nWarning: The following column names of the ",
-                tabNamC,
-                " table are not in the standard R format, which may result in errors when loading the data in some of the W4M modules:\n", sep="")
-            print(colDifDF)
-        }
     }
+    
 
     ## reading tables
 
@@ -94,93 +70,302 @@
                         header = TRUE,
                         row.names = 1,
                         sep = "\t")
+    
 
-    ## checking formats
+    ## checking that dataMatrix is numeric and that the sample and variable numbers are coherent
+
+    if(mode(datMN) != "numeric") {
+        stop("The dataMatrix is not of the 'numeric' type",
+             call. = FALSE)
+    }
+    
+    if(nrow(datMN) != nrow(samDF)) {
+        if(nrow(datMN) > nrow(samDF)) {
+            print(setdiff(rownames(datMN), rownames(samDF)))
+            stop("The sample names above from dataMatrix were not found in sampleMetadata")
+        } else {
+            print(setdiff(rownames(samDF), rownames(datMN)))
+            stop("The sample names above from sampleMetadata were not found in dataMatrix")
+        }
+    }
+
+    if(ncol(datMN) != nrow(varDF)) {
+        if(ncol(datMN) > nrow(varDF)) {
+            print(setdiff(colnames(datMN), rownames(varDF)))
+            stop("The variable names above from dataMatrix were not found in variableMetadata")
+        } else {
+            print(setdiff(rownames(varDF), colnames(datMN)))
+            stop("The variable names above from variableMetadata were not found in dataMatrix")
+        }
+    }
+    
+
+    ## making sample and variable names (optional)
+
+    newL <- FALSE
+
+    if(makNamL) {
+
+        cat("\n\nMessage: Converting sample and variable names to the standard R format\n")
+
+        rownames(datMN) <- make.names(rownames(datMN), unique = TRUE)
+        colnames(datMN) <- make.names(colnames(datMN), unique = TRUE)
+        rownames(samDF) <- make.names(rownames(samDF), unique = TRUE)
+        rownames(varDF) <- make.names(rownames(varDF), unique = TRUE)
+
+        newL <- TRUE
+
+    }
+    
+ 
+    ## checking sample and variable names
 
     chkL <- TRUE
 
     if(!identical(rownames(datMN), rownames(samDF))) {
-        ## checking sample names
 
-        chkL <- FALSE
-
-        datSamDifVc <- setdiff(rownames(datMN), rownames(samDF))
+        if(identical(sort(rownames(datMN)), sort(rownames(samDF)))) {
 
-        if(length(datSamDifVc)) {
-            cat("\nThe following samples were found in the dataMatrix column names but not in the sampleMetadata row names:\n", sep="")
-            print(cbind.data.frame(col = as.numeric(sapply(datSamDifVc, function(samC) which(rownames(datMN) == samC))),
-                                   name = datSamDifVc))
-        }
-
-        samDatDifVc <- setdiff(rownames(samDF), rownames(datMN))
+            cat("\n\nMessage: Re-ordering dataMatrix sample names to match sampleMetadata\n")
+            datMN <- datMN[rownames(samDF), , drop = FALSE]
+            
+            stopifnot(identical(sort(rownames(datMN)), sort(rownames(samDF))))
 
-        if(length(samDatDifVc)) {
-            cat("\n\nThe following samples were found in the sampleMetadata row names but not in the dataMatrix column names:\n", sep="")
-            print(cbind.data.frame(row = as.numeric(sapply(samDatDifVc, function(samC) which(rownames(samDF) == samC))),
-                                   name = samDatDifVc))
-        }
-
-        if(nrow(datMN) != nrow(samDF)) {
-            cat("\n\nThe dataMatrix has ", nrow(datMN), " columns (ie samples) whereas the sampleMetadata has ", nrow(samDF), " rows\n", sep="")
-        } else if(identical(gsub("^X", "", rownames(datMN)), rownames(samDF))) {
-            cat("\n\nThe dataMatrix column names start with an 'X' but not the sampleMetadata row names\n", sep="")
-        } else if(identical(gsub("^X", "", rownames(samDF)), rownames(datMN))) {
-            cat("\n\nThe sampleMetadata row names start with an 'X' but not the dataMatrix column names\n", sep="")
-        } else if(identical(sort(rownames(datMN)), sort(rownames(samDF)))) {
-            cat("\n\nThe dataMatrix column names and the sampleMetadata row names are not in the same order:\n", sep="")
+            newL <- TRUE
+            
+        }  else {
+            
+            cat("\n\nStop: The sample names of dataMatrix and sampleMetadata do not match:\n")
             print(cbind.data.frame(indice = 1:nrow(datMN),
-                                   dataMatrix_columnnames=rownames(datMN),
-                                   sampleMetadata_rownames=rownames(samDF))[rownames(datMN) != rownames(samDF), , drop = FALSE])
-        } else {
-            cat("\n\nThe dataMatrix column names and the sampleMetadata row names are not identical:\n", sep="")
-            print(cbind.data.frame(indice = 1:nrow(datMN),
-                                   dataMatrix_columnnames=rownames(datMN),
-                                   sampleMetadata_rownames=rownames(samDF))[rownames(datMN) != rownames(samDF), , drop = FALSE])
+                                   dataMatrix=rownames(datMN),
+                                   sampleMetadata=rownames(samDF))[rownames(datMN) != rownames(samDF), , drop = FALSE])
+            chkL <- FALSE
+            
         }
 
     }
 
     if(!identical(colnames(datMN), rownames(varDF))) {
-        ## checking variable names
+        
+        if(identical(sort(colnames(datMN)), sort(rownames(varDF)))) {
+            
+            cat("\n\nMessage: Re-ordering dataMatrix variable names to match variableMetadata\n")
+            datMN <- datMN[, rownames(varDF), drop = FALSE]
 
-        chkL <- FALSE
-
-        datVarDifVc <- setdiff(colnames(datMN), rownames(varDF))
+            stopifnot(identical(sort(colnames(datMN)), sort(rownames(varDF))))
 
-        if(length(datVarDifVc)) {
-            cat("\nThe following variables were found in the dataMatrix row names but not in the variableMetadata row names:\n", sep="")
-            print(cbind.data.frame(row = as.numeric(sapply(datVarDifVc, function(varC) which(colnames(datMN) == varC))),
-                        name = datVarDifVc))
-
+            newL <- TRUE
+            
+        }  else {
+            
+            cat("\n\nStop: The variable names of dataMatrix and variableMetadata do not match:\n")
+            print(cbind.data.frame(indice = 1:ncol(datMN),
+                                   dataMatrix=colnames(datMN),
+                                   variableMetadata=rownames(varDF))[colnames(datMN) != rownames(varDF), , drop = FALSE])
+            chkL <- FALSE
+            
         }
 
-        varDatDifVc <- setdiff(rownames(varDF), colnames(datMN))
-
-        if(length(varDatDifVc)) {
-            cat("\n\nThe following variables were found in the variableMetadata row names but not in the dataMatrix row names:\n", sep="")
-            print(cbind.data.frame(row = as.numeric(sapply(varDatDifVc, function(varC) which(rownames(varDF) == varC))),
-                        name = varDatDifVc))
-        }
-
-        if(ncol(datMN) != nrow(varDF)) {
-            cat("\n\nThe dataMatrix has ", nrow(datMN), " rows (ie variables) whereas the variableMetadata has ", nrow(varDF), " rows\n", sep="")
-        } else if(identical(sort(colnames(datMN)), sort(rownames(varDF)))) {
-            cat("\n\nThe dataMatrix row names and the variableMetadata row names are not in the same order:\n", sep="")
-            print(cbind.data.frame(row = 1:ncol(datMN),
-                                   dataMatrix_rownames=colnames(datMN),
-                                   variableMetadata_rownames=rownames(varDF))[colnames(datMN) != rownames(varDF), , drop = FALSE])
-        } else {
-            cat("\n\nThe dataMatrix row names and the variableMetadata row names are not identical:\n", sep="")
-            print(cbind.data.frame(row = 1:ncol(datMN),
-                                   dataMatrix_rownames=colnames(datMN),
-                                   variableMetadata_rownames=rownames(varDF))[colnames(datMN) != rownames(varDF), , drop = FALSE])
-        }
     }
+        
 
     options(stringsAsFactors=optStrAsFacL)
 
-    resLs <- list(chkL=chkL)
+    resLs <- list(chkL=chkL,
+                  newL = newL,
+                  datMN = datMN,
+                  samDF = samDF,
+                  varDF = varDF)
 
     return(resLs)
 
 } ## end of checkAndReadF
+
+
+
+
+    ## if(!identical(rownames(datMN), rownames(samDF))) {
+    ##     ## checking sample names
+
+    ##     chkL <- FALSE
+
+    ##     datSamDifVc <- setdiff(rownames(datMN), rownames(samDF))
+
+    ##     if(length(datSamDifVc)) {
+    ##         cat("\nThe following samples were found in the dataMatrix column names but not in the sampleMetadata row names:\n", sep="")
+    ##         print(cbind.data.frame(col = as.numeric(sapply(datSamDifVc, function(samC) which(rownames(datMN) == samC))),
+    ##                                name = datSamDifVc))
+    ##     }
+
+    ##     samDatDifVc <- setdiff(rownames(samDF), rownames(datMN))
+
+    ##     if(length(samDatDifVc)) {
+    ##         cat("\n\nThe following samples were found in the sampleMetadata row names but not in the dataMatrix column names:\n", sep="")
+    ##         print(cbind.data.frame(row = as.numeric(sapply(samDatDifVc, function(samC) which(rownames(samDF) == samC))),
+    ##                                name = samDatDifVc))
+    ##     }
+
+    ##     if(nrow(datMN) != nrow(samDF)) {
+    ##         cat("\n\nThe dataMatrix has ", nrow(datMN), " columns (ie samples) whereas the sampleMetadata has ", nrow(samDF), " rows\n", sep="")
+    ##     } else if(identical(gsub("^X", "", rownames(datMN)), rownames(samDF))) {
+    ##         cat("\n\nThe dataMatrix column names start with an 'X' but not the sampleMetadata row names\n", sep="")
+    ##     } else if(identical(gsub("^X", "", rownames(samDF)), rownames(datMN))) {
+    ##         cat("\n\nThe sampleMetadata row names start with an 'X' but not the dataMatrix column names\n", sep="")
+    ##     } else if(identical(sort(rownames(datMN)), sort(rownames(samDF)))) {
+    ##         cat("\n\nThe dataMatrix column names and the sampleMetadata row names are not in the same order:\n", sep="")
+    ##         print(cbind.data.frame(indice = 1:nrow(datMN),
+    ##                                dataMatrix_columnnames=rownames(datMN),
+    ##                                sampleMetadata_rownames=rownames(samDF))[rownames(datMN) != rownames(samDF), , drop = FALSE])
+    ##     } else {
+    ##         cat("\n\nThe dataMatrix column names and the sampleMetadata row names are not identical:\n", sep="")
+    ##         print(cbind.data.frame(indice = 1:nrow(datMN),
+    ##                                dataMatrix_columnnames=rownames(datMN),
+    ##                                sampleMetadata_rownames=rownames(samDF))[rownames(datMN) != rownames(samDF), , drop = FALSE])
+    ##     }
+
+    ## }
+    ##     datRowVc <- rownames(datMN)
+    ##     datRowMakVc <- make.names(datRowVc, unique = TRUE)
+    ##     if(datRowMakVc != datRowVc) {
+    ##         rownames(datMN) <- datRowMakVc
+    ##         cat("\n\nMessage: The sample names of the dataMatrix have been converted to the standard R format\n")
+    ##     }
+
+    ##     datColVc <- colnames(datMN)
+    ##     datColMakVc <- make.names(datColVc, unique = TRUE)
+    ##     if(datColMakVc != datColVc) {
+    ##         colnames(datMN) <- datColMakVc
+    ##         cat("\n\nMessage: The variable names of the dataMatrix have been converted to the standard R format\n")
+    ##     }
+
+    ##     samRowVc <- rownames(datMN)
+    ##     samRowMakVc <- make.names(samRowVc, unique = TRUE)
+    ##     if(samRowMakVc != samRowVc) {
+    ##         rownames(datMN) <- samRowMakVc
+    ##         cat("\n\nMessage: The sample names of the sampleMetadata have been converted to the standard R format\n")
+    ##     }
+
+    ##     datRowVc <- rownames(datMN)
+    ##     datRowMakVc <- make.names(datRowVc, unique = TRUE)
+    ##     if(datRowMakVc != datRowVc) {
+    ##         rownames(datMN) <- datRowMakVc
+    ##         cat("\n\nMessage: The sample names of the dataMatrix have been converted to the standard R format\n")
+    ##     }
+
+    ## }
+
+    ## checking names (optional)
+
+    
+
+
+    ## datRowMakVc <- make.names(datRowVc, unique = TRUE)
+    ## if(datRowMakVc != datRowVc) {
+    ##     if(makNamL) {
+    ##         rownames(datMN) <- datRowMakVc
+    ##         cat("\n\nMessage: The sample names of the dataMatrix have been converted to the standard R format; select the make names argument to convert them\n")
+    ##     } else {
+    ##         cat("\n\nWarning: Some of the sample names of the dataMatrix are not in the standard R format; select the make names argument to convert them\n")
+    ##     }
+
+    ## if(makNamL) {
+
+    ##     rownames(datMN) <- make.names(rownames(datMN), unique = TRUE)
+    ##     colnames(datMN) <- make.names(colnames(datMN), unique = TRUE)
+    ##     rownames(samDF) <- make.names(rownames(samDF), unique = TRUE)
+    ##     rownames(varDF) <- make.names(rownames(varDF), unique = TRUE)
+ 
+    ##     }
+        ## checking sample names
+
+    ##     if(nrow(datMN) == nrow(samDF)) {
+
+            
+            
+    ##     }
+
+    ##     chkL <- FALSE
+
+    ##     datSamDifVc <- setdiff(rownames(datMN), rownames(samDF))
+
+    ##     if(length(datSamDifVc)) {
+    ##         cat("\nThe following samples were found in the dataMatrix column names but not in the sampleMetadata row names:\n", sep="")
+    ##         print(cbind.data.frame(col = as.numeric(sapply(datSamDifVc, function(samC) which(rownames(datMN) == samC))),
+    ##                                name = datSamDifVc))
+    ##     }
+
+    ##     samDatDifVc <- setdiff(rownames(samDF), rownames(datMN))
+
+    ##     if(length(samDatDifVc)) {
+    ##         cat("\n\nThe following samples were found in the sampleMetadata row names but not in the dataMatrix column names:\n", sep="")
+    ##         print(cbind.data.frame(row = as.numeric(sapply(samDatDifVc, function(samC) which(rownames(samDF) == samC))),
+    ##                                name = samDatDifVc))
+    ##     }
+
+    ##     if(nrow(datMN) != nrow(samDF)) {
+    ##         cat("\n\nThe dataMatrix has ", nrow(datMN), " columns (ie samples) whereas the sampleMetadata has ", nrow(samDF), " rows\n", sep="")
+    ##     } else if(identical(gsub("^X", "", rownames(datMN)), rownames(samDF))) {
+    ##         cat("\n\nThe dataMatrix column names start with an 'X' but not the sampleMetadata row names\n", sep="")
+    ##     } else if(identical(gsub("^X", "", rownames(samDF)), rownames(datMN))) {
+    ##         cat("\n\nThe sampleMetadata row names start with an 'X' but not the dataMatrix column names\n", sep="")
+    ##     } else if(identical(sort(rownames(datMN)), sort(rownames(samDF)))) {
+    ##         cat("\n\nThe dataMatrix column names and the sampleMetadata row names are not in the same order:\n", sep="")
+    ##         print(cbind.data.frame(indice = 1:nrow(datMN),
+    ##                                dataMatrix_columnnames=rownames(datMN),
+    ##                                sampleMetadata_rownames=rownames(samDF))[rownames(datMN) != rownames(samDF), , drop = FALSE])
+    ##     } else {
+    ##         cat("\n\nThe dataMatrix column names and the sampleMetadata row names are not identical:\n", sep="")
+    ##         print(cbind.data.frame(indice = 1:nrow(datMN),
+    ##                                dataMatrix_columnnames=rownames(datMN),
+    ##                                sampleMetadata_rownames=rownames(samDF))[rownames(datMN) != rownames(samDF), , drop = FALSE])
+    ##     }
+
+    ## }
+
+
+    ## if(!identical(colnames(datMN), rownames(varDF))) {
+    ##     ## checking variable names
+
+    ##     chkL <- FALSE
+
+    ##     datVarDifVc <- setdiff(colnames(datMN), rownames(varDF))
+
+    ##     if(length(datVarDifVc)) {
+    ##         cat("\nThe following variables were found in the dataMatrix row names but not in the variableMetadata row names:\n", sep="")
+    ##         print(cbind.data.frame(row = as.numeric(sapply(datVarDifVc, function(varC) which(colnames(datMN) == varC))),
+    ##                     name = datVarDifVc))
+
+    ##     }
+
+    ##     varDatDifVc <- setdiff(rownames(varDF), colnames(datMN))
+
+    ##     if(length(varDatDifVc)) {
+    ##         cat("\n\nThe following variables were found in the variableMetadata row names but not in the dataMatrix row names:\n", sep="")
+    ##         print(cbind.data.frame(row = as.numeric(sapply(varDatDifVc, function(varC) which(rownames(varDF) == varC))),
+    ##                     name = varDatDifVc))
+    ##     }
+
+    ##     if(ncol(datMN) != nrow(varDF)) {
+    ##         cat("\n\nThe dataMatrix has ", nrow(datMN), " rows (ie variables) whereas the variableMetadata has ", nrow(varDF), " rows\n", sep="")
+    ##     } else if(identical(sort(colnames(datMN)), sort(rownames(varDF)))) {
+    ##         cat("\n\nThe dataMatrix row names and the variableMetadata row names are not in the same order:\n", sep="")
+    ##         print(cbind.data.frame(row = 1:ncol(datMN),
+    ##                                dataMatrix_rownames=colnames(datMN),
+    ##                                variableMetadata_rownames=rownames(varDF))[colnames(datMN) != rownames(varDF), , drop = FALSE])
+    ##     } else {
+    ##         cat("\n\nThe dataMatrix row names and the variableMetadata row names are not identical:\n", sep="")
+    ##         print(cbind.data.frame(row = 1:ncol(datMN),
+    ##                                dataMatrix_rownames=colnames(datMN),
+    ##                                variableMetadata_rownames=rownames(varDF))[colnames(datMN) != rownames(varDF), , drop = FALSE])
+    ##     }
+    ## }
+## checkF <- function(datInpMN,
+##                    samInpDF,
+##                    varInpDF) {
+
+##     mode(datInpMN) == "numeric" &&
+##         identical(rownames(datInpMN), rownames(samInpDF)) &&
+##         identical(colnames(datInpMN), rownames(varInpDF))
+    
+
+## }