Mercurial > repos > ethevenot > checkformat
diff checkformat_script.R @ 3:80a38d36f946 draft
planemo upload for repository https://github.com/workflow4metabolomics/checkformat.git commit 5cf3f6eb62c1396ade1b068a3dd3cc2e3f827e15
author | ethevenot |
---|---|
date | Thu, 11 Jan 2018 10:24:56 -0500 |
parents | e194eec8e70c |
children | 9590fac86f63 |
line wrap: on
line diff
--- a/checkformat_script.R Tue Jun 06 11:51:33 2017 -0400 +++ b/checkformat_script.R Thu Jan 11 10:24:56 2018 -0500 @@ -3,16 +3,20 @@ ## etienne.thevenot@cea.fr + ## Reads the dataMatrix, sampleMetadata, and variableMetadata .tsv files ## and checks the formats readAndCheckF <- function(datFilC="dataMatrix.tsv", samFilC="sampleMetadata.tsv", - varFilC="variableMetadata.tsv") { + varFilC="variableMetadata.tsv", + makNamL) { + ## options optStrAsFacL <- options()[["stringsAsFactors"]] options(stringsAsFactors = FALSE) + ## checking that the tables have no duplicated row or column names @@ -44,36 +48,8 @@ paste(colVc[duplicated(colVc)], collapse="', '"), "'", call.=FALSE) - rowMakVc <- make.names(rowVc, unique = TRUE) - - rowDifVl <- rowVc != rowMakVc - - if(any(rowDifVl)) { - rowDifDF <- data.frame(row = 1:length(rowVc), - actual = rowVc, - preferred = rowMakVc) - rowDifDF <- rowDifDF[rowDifVl, , drop = FALSE] - cat("\n\nWarning: The following row names of the ", - tabNamC, - " table are not in the standard R format, which may result in errors when loading the data in some of the W4M modules:\n", sep="") - print(rowDifDF) - } - - colMakVc <- make.names(colVc, unique = TRUE) - - colDifVl <- colVc != colMakVc - - if(any(colDifVl)) { - colDifDF <- data.frame(col = 1:length(colVc), - actual = colVc, - preferred = colMakVc) - colDifDF <- colDifDF[colDifVl, , drop = FALSE] - cat("\n\nWarning: The following column names of the ", - tabNamC, - " table are not in the standard R format, which may result in errors when loading the data in some of the W4M modules:\n", sep="") - print(colDifDF) - } } + ## reading tables @@ -94,93 +70,302 @@ header = TRUE, row.names = 1, sep = "\t") + - ## checking formats + ## checking that dataMatrix is numeric and that the sample and variable numbers are coherent + + if(mode(datMN) != "numeric") { + stop("The dataMatrix is not of the 'numeric' type", + call. = FALSE) + } + + if(nrow(datMN) != nrow(samDF)) { + if(nrow(datMN) > nrow(samDF)) { + print(setdiff(rownames(datMN), rownames(samDF))) + stop("The sample names above from dataMatrix were not found in sampleMetadata") + } else { + print(setdiff(rownames(samDF), rownames(datMN))) + stop("The sample names above from sampleMetadata were not found in dataMatrix") + } + } + + if(ncol(datMN) != nrow(varDF)) { + if(ncol(datMN) > nrow(varDF)) { + print(setdiff(colnames(datMN), rownames(varDF))) + stop("The variable names above from dataMatrix were not found in variableMetadata") + } else { + print(setdiff(rownames(varDF), colnames(datMN))) + stop("The variable names above from variableMetadata were not found in dataMatrix") + } + } + + + ## making sample and variable names (optional) + + newL <- FALSE + + if(makNamL) { + + cat("\n\nMessage: Converting sample and variable names to the standard R format\n") + + rownames(datMN) <- make.names(rownames(datMN), unique = TRUE) + colnames(datMN) <- make.names(colnames(datMN), unique = TRUE) + rownames(samDF) <- make.names(rownames(samDF), unique = TRUE) + rownames(varDF) <- make.names(rownames(varDF), unique = TRUE) + + newL <- TRUE + + } + + + ## checking sample and variable names chkL <- TRUE if(!identical(rownames(datMN), rownames(samDF))) { - ## checking sample names - chkL <- FALSE - - datSamDifVc <- setdiff(rownames(datMN), rownames(samDF)) + if(identical(sort(rownames(datMN)), sort(rownames(samDF)))) { - if(length(datSamDifVc)) { - cat("\nThe following samples were found in the dataMatrix column names but not in the sampleMetadata row names:\n", sep="") - print(cbind.data.frame(col = as.numeric(sapply(datSamDifVc, function(samC) which(rownames(datMN) == samC))), - name = datSamDifVc)) - } - - samDatDifVc <- setdiff(rownames(samDF), rownames(datMN)) + cat("\n\nMessage: Re-ordering dataMatrix sample names to match sampleMetadata\n") + datMN <- datMN[rownames(samDF), , drop = FALSE] + + stopifnot(identical(sort(rownames(datMN)), sort(rownames(samDF)))) - if(length(samDatDifVc)) { - cat("\n\nThe following samples were found in the sampleMetadata row names but not in the dataMatrix column names:\n", sep="") - print(cbind.data.frame(row = as.numeric(sapply(samDatDifVc, function(samC) which(rownames(samDF) == samC))), - name = samDatDifVc)) - } - - if(nrow(datMN) != nrow(samDF)) { - cat("\n\nThe dataMatrix has ", nrow(datMN), " columns (ie samples) whereas the sampleMetadata has ", nrow(samDF), " rows\n", sep="") - } else if(identical(gsub("^X", "", rownames(datMN)), rownames(samDF))) { - cat("\n\nThe dataMatrix column names start with an 'X' but not the sampleMetadata row names\n", sep="") - } else if(identical(gsub("^X", "", rownames(samDF)), rownames(datMN))) { - cat("\n\nThe sampleMetadata row names start with an 'X' but not the dataMatrix column names\n", sep="") - } else if(identical(sort(rownames(datMN)), sort(rownames(samDF)))) { - cat("\n\nThe dataMatrix column names and the sampleMetadata row names are not in the same order:\n", sep="") + newL <- TRUE + + } else { + + cat("\n\nStop: The sample names of dataMatrix and sampleMetadata do not match:\n") print(cbind.data.frame(indice = 1:nrow(datMN), - dataMatrix_columnnames=rownames(datMN), - sampleMetadata_rownames=rownames(samDF))[rownames(datMN) != rownames(samDF), , drop = FALSE]) - } else { - cat("\n\nThe dataMatrix column names and the sampleMetadata row names are not identical:\n", sep="") - print(cbind.data.frame(indice = 1:nrow(datMN), - dataMatrix_columnnames=rownames(datMN), - sampleMetadata_rownames=rownames(samDF))[rownames(datMN) != rownames(samDF), , drop = FALSE]) + dataMatrix=rownames(datMN), + sampleMetadata=rownames(samDF))[rownames(datMN) != rownames(samDF), , drop = FALSE]) + chkL <- FALSE + } } if(!identical(colnames(datMN), rownames(varDF))) { - ## checking variable names + + if(identical(sort(colnames(datMN)), sort(rownames(varDF)))) { + + cat("\n\nMessage: Re-ordering dataMatrix variable names to match variableMetadata\n") + datMN <- datMN[, rownames(varDF), drop = FALSE] - chkL <- FALSE - - datVarDifVc <- setdiff(colnames(datMN), rownames(varDF)) + stopifnot(identical(sort(colnames(datMN)), sort(rownames(varDF)))) - if(length(datVarDifVc)) { - cat("\nThe following variables were found in the dataMatrix row names but not in the variableMetadata row names:\n", sep="") - print(cbind.data.frame(row = as.numeric(sapply(datVarDifVc, function(varC) which(colnames(datMN) == varC))), - name = datVarDifVc)) - + newL <- TRUE + + } else { + + cat("\n\nStop: The variable names of dataMatrix and variableMetadata do not match:\n") + print(cbind.data.frame(indice = 1:ncol(datMN), + dataMatrix=colnames(datMN), + variableMetadata=rownames(varDF))[colnames(datMN) != rownames(varDF), , drop = FALSE]) + chkL <- FALSE + } - varDatDifVc <- setdiff(rownames(varDF), colnames(datMN)) - - if(length(varDatDifVc)) { - cat("\n\nThe following variables were found in the variableMetadata row names but not in the dataMatrix row names:\n", sep="") - print(cbind.data.frame(row = as.numeric(sapply(varDatDifVc, function(varC) which(rownames(varDF) == varC))), - name = varDatDifVc)) - } - - if(ncol(datMN) != nrow(varDF)) { - cat("\n\nThe dataMatrix has ", nrow(datMN), " rows (ie variables) whereas the variableMetadata has ", nrow(varDF), " rows\n", sep="") - } else if(identical(sort(colnames(datMN)), sort(rownames(varDF)))) { - cat("\n\nThe dataMatrix row names and the variableMetadata row names are not in the same order:\n", sep="") - print(cbind.data.frame(row = 1:ncol(datMN), - dataMatrix_rownames=colnames(datMN), - variableMetadata_rownames=rownames(varDF))[colnames(datMN) != rownames(varDF), , drop = FALSE]) - } else { - cat("\n\nThe dataMatrix row names and the variableMetadata row names are not identical:\n", sep="") - print(cbind.data.frame(row = 1:ncol(datMN), - dataMatrix_rownames=colnames(datMN), - variableMetadata_rownames=rownames(varDF))[colnames(datMN) != rownames(varDF), , drop = FALSE]) - } } + options(stringsAsFactors=optStrAsFacL) - resLs <- list(chkL=chkL) + resLs <- list(chkL=chkL, + newL = newL, + datMN = datMN, + samDF = samDF, + varDF = varDF) return(resLs) } ## end of checkAndReadF + + + + + ## if(!identical(rownames(datMN), rownames(samDF))) { + ## ## checking sample names + + ## chkL <- FALSE + + ## datSamDifVc <- setdiff(rownames(datMN), rownames(samDF)) + + ## if(length(datSamDifVc)) { + ## cat("\nThe following samples were found in the dataMatrix column names but not in the sampleMetadata row names:\n", sep="") + ## print(cbind.data.frame(col = as.numeric(sapply(datSamDifVc, function(samC) which(rownames(datMN) == samC))), + ## name = datSamDifVc)) + ## } + + ## samDatDifVc <- setdiff(rownames(samDF), rownames(datMN)) + + ## if(length(samDatDifVc)) { + ## cat("\n\nThe following samples were found in the sampleMetadata row names but not in the dataMatrix column names:\n", sep="") + ## print(cbind.data.frame(row = as.numeric(sapply(samDatDifVc, function(samC) which(rownames(samDF) == samC))), + ## name = samDatDifVc)) + ## } + + ## if(nrow(datMN) != nrow(samDF)) { + ## cat("\n\nThe dataMatrix has ", nrow(datMN), " columns (ie samples) whereas the sampleMetadata has ", nrow(samDF), " rows\n", sep="") + ## } else if(identical(gsub("^X", "", rownames(datMN)), rownames(samDF))) { + ## cat("\n\nThe dataMatrix column names start with an 'X' but not the sampleMetadata row names\n", sep="") + ## } else if(identical(gsub("^X", "", rownames(samDF)), rownames(datMN))) { + ## cat("\n\nThe sampleMetadata row names start with an 'X' but not the dataMatrix column names\n", sep="") + ## } else if(identical(sort(rownames(datMN)), sort(rownames(samDF)))) { + ## cat("\n\nThe dataMatrix column names and the sampleMetadata row names are not in the same order:\n", sep="") + ## print(cbind.data.frame(indice = 1:nrow(datMN), + ## dataMatrix_columnnames=rownames(datMN), + ## sampleMetadata_rownames=rownames(samDF))[rownames(datMN) != rownames(samDF), , drop = FALSE]) + ## } else { + ## cat("\n\nThe dataMatrix column names and the sampleMetadata row names are not identical:\n", sep="") + ## print(cbind.data.frame(indice = 1:nrow(datMN), + ## dataMatrix_columnnames=rownames(datMN), + ## sampleMetadata_rownames=rownames(samDF))[rownames(datMN) != rownames(samDF), , drop = FALSE]) + ## } + + ## } + ## datRowVc <- rownames(datMN) + ## datRowMakVc <- make.names(datRowVc, unique = TRUE) + ## if(datRowMakVc != datRowVc) { + ## rownames(datMN) <- datRowMakVc + ## cat("\n\nMessage: The sample names of the dataMatrix have been converted to the standard R format\n") + ## } + + ## datColVc <- colnames(datMN) + ## datColMakVc <- make.names(datColVc, unique = TRUE) + ## if(datColMakVc != datColVc) { + ## colnames(datMN) <- datColMakVc + ## cat("\n\nMessage: The variable names of the dataMatrix have been converted to the standard R format\n") + ## } + + ## samRowVc <- rownames(datMN) + ## samRowMakVc <- make.names(samRowVc, unique = TRUE) + ## if(samRowMakVc != samRowVc) { + ## rownames(datMN) <- samRowMakVc + ## cat("\n\nMessage: The sample names of the sampleMetadata have been converted to the standard R format\n") + ## } + + ## datRowVc <- rownames(datMN) + ## datRowMakVc <- make.names(datRowVc, unique = TRUE) + ## if(datRowMakVc != datRowVc) { + ## rownames(datMN) <- datRowMakVc + ## cat("\n\nMessage: The sample names of the dataMatrix have been converted to the standard R format\n") + ## } + + ## } + + ## checking names (optional) + + + + + ## datRowMakVc <- make.names(datRowVc, unique = TRUE) + ## if(datRowMakVc != datRowVc) { + ## if(makNamL) { + ## rownames(datMN) <- datRowMakVc + ## cat("\n\nMessage: The sample names of the dataMatrix have been converted to the standard R format; select the make names argument to convert them\n") + ## } else { + ## cat("\n\nWarning: Some of the sample names of the dataMatrix are not in the standard R format; select the make names argument to convert them\n") + ## } + + ## if(makNamL) { + + ## rownames(datMN) <- make.names(rownames(datMN), unique = TRUE) + ## colnames(datMN) <- make.names(colnames(datMN), unique = TRUE) + ## rownames(samDF) <- make.names(rownames(samDF), unique = TRUE) + ## rownames(varDF) <- make.names(rownames(varDF), unique = TRUE) + + ## } + ## checking sample names + + ## if(nrow(datMN) == nrow(samDF)) { + + + + ## } + + ## chkL <- FALSE + + ## datSamDifVc <- setdiff(rownames(datMN), rownames(samDF)) + + ## if(length(datSamDifVc)) { + ## cat("\nThe following samples were found in the dataMatrix column names but not in the sampleMetadata row names:\n", sep="") + ## print(cbind.data.frame(col = as.numeric(sapply(datSamDifVc, function(samC) which(rownames(datMN) == samC))), + ## name = datSamDifVc)) + ## } + + ## samDatDifVc <- setdiff(rownames(samDF), rownames(datMN)) + + ## if(length(samDatDifVc)) { + ## cat("\n\nThe following samples were found in the sampleMetadata row names but not in the dataMatrix column names:\n", sep="") + ## print(cbind.data.frame(row = as.numeric(sapply(samDatDifVc, function(samC) which(rownames(samDF) == samC))), + ## name = samDatDifVc)) + ## } + + ## if(nrow(datMN) != nrow(samDF)) { + ## cat("\n\nThe dataMatrix has ", nrow(datMN), " columns (ie samples) whereas the sampleMetadata has ", nrow(samDF), " rows\n", sep="") + ## } else if(identical(gsub("^X", "", rownames(datMN)), rownames(samDF))) { + ## cat("\n\nThe dataMatrix column names start with an 'X' but not the sampleMetadata row names\n", sep="") + ## } else if(identical(gsub("^X", "", rownames(samDF)), rownames(datMN))) { + ## cat("\n\nThe sampleMetadata row names start with an 'X' but not the dataMatrix column names\n", sep="") + ## } else if(identical(sort(rownames(datMN)), sort(rownames(samDF)))) { + ## cat("\n\nThe dataMatrix column names and the sampleMetadata row names are not in the same order:\n", sep="") + ## print(cbind.data.frame(indice = 1:nrow(datMN), + ## dataMatrix_columnnames=rownames(datMN), + ## sampleMetadata_rownames=rownames(samDF))[rownames(datMN) != rownames(samDF), , drop = FALSE]) + ## } else { + ## cat("\n\nThe dataMatrix column names and the sampleMetadata row names are not identical:\n", sep="") + ## print(cbind.data.frame(indice = 1:nrow(datMN), + ## dataMatrix_columnnames=rownames(datMN), + ## sampleMetadata_rownames=rownames(samDF))[rownames(datMN) != rownames(samDF), , drop = FALSE]) + ## } + + ## } + + + ## if(!identical(colnames(datMN), rownames(varDF))) { + ## ## checking variable names + + ## chkL <- FALSE + + ## datVarDifVc <- setdiff(colnames(datMN), rownames(varDF)) + + ## if(length(datVarDifVc)) { + ## cat("\nThe following variables were found in the dataMatrix row names but not in the variableMetadata row names:\n", sep="") + ## print(cbind.data.frame(row = as.numeric(sapply(datVarDifVc, function(varC) which(colnames(datMN) == varC))), + ## name = datVarDifVc)) + + ## } + + ## varDatDifVc <- setdiff(rownames(varDF), colnames(datMN)) + + ## if(length(varDatDifVc)) { + ## cat("\n\nThe following variables were found in the variableMetadata row names but not in the dataMatrix row names:\n", sep="") + ## print(cbind.data.frame(row = as.numeric(sapply(varDatDifVc, function(varC) which(rownames(varDF) == varC))), + ## name = varDatDifVc)) + ## } + + ## if(ncol(datMN) != nrow(varDF)) { + ## cat("\n\nThe dataMatrix has ", nrow(datMN), " rows (ie variables) whereas the variableMetadata has ", nrow(varDF), " rows\n", sep="") + ## } else if(identical(sort(colnames(datMN)), sort(rownames(varDF)))) { + ## cat("\n\nThe dataMatrix row names and the variableMetadata row names are not in the same order:\n", sep="") + ## print(cbind.data.frame(row = 1:ncol(datMN), + ## dataMatrix_rownames=colnames(datMN), + ## variableMetadata_rownames=rownames(varDF))[colnames(datMN) != rownames(varDF), , drop = FALSE]) + ## } else { + ## cat("\n\nThe dataMatrix row names and the variableMetadata row names are not identical:\n", sep="") + ## print(cbind.data.frame(row = 1:ncol(datMN), + ## dataMatrix_rownames=colnames(datMN), + ## variableMetadata_rownames=rownames(varDF))[colnames(datMN) != rownames(varDF), , drop = FALSE]) + ## } + ## } +## checkF <- function(datInpMN, +## samInpDF, +## varInpDF) { + +## mode(datInpMN) == "numeric" && +## identical(rownames(datInpMN), rownames(samInpDF)) && +## identical(colnames(datInpMN), rownames(varInpDF)) + + +## }