Mercurial > repos > ethevenot > checkformat
comparison checkformat_script.R @ 1:e194eec8e70c draft
planemo upload for repository https://github.com/workflow4metabolomics/checkformat.git commit 8ebfbfa8d9449c9bbfbf569851a30b1e33df0b3f
| author | ethevenot |
|---|---|
| date | Sat, 06 Aug 2016 11:54:28 -0400 |
| parents | |
| children | 80a38d36f946 |
comparison
equal
deleted
inserted
replaced
| 0:0d8099822c49 | 1:e194eec8e70c |
|---|---|
| 1 ## Etienne Thevenot | |
| 2 ## CEA, MetaboHUB Paris | |
| 3 ## etienne.thevenot@cea.fr | |
| 4 | |
| 5 | |
## Reads the dataMatrix, sampleMetadata, and variableMetadata .tsv files
## and checks the formats
##
## Arguments:
##   datFilC: path to the tab-separated dataMatrix file (variables as rows,
##            samples as columns; names in the first column and first line)
##   samFilC: path to the tab-separated sampleMetadata file (samples as rows)
##   varFilC: path to the tab-separated variableMetadata file (variables as rows)
##
## Value:
##   a list with a single element 'chkL': TRUE when the sample names and the
##   variable names of the three tables are identical and in the same order,
##   FALSE otherwise
##
## Side effects:
##   - stops with an error when a table has duplicated row or column names
##   - prints (standard output) a warning listing the names which would be
##     modified by 'make.names', and the details of any mismatch between tables
readAndCheckF <- function(datFilC="dataMatrix.tsv",
                          samFilC="sampleMetadata.tsv",
                          varFilC="variableMetadata.tsv") {

    ## options
    ## (restored on exit, including when 'stop' is called below)

    optLs <- options(stringsAsFactors = FALSE)
    on.exit(options(optLs), add = TRUE)

    ## local helpers

    ## stops when some of the names are duplicated
    ## (dimC: "row" or "column", as displayed in the message)
    dupStopF <- function(namVc, tabNamC, dimC) {
        dupVl <- duplicated(namVc)
        if(any(dupVl))
            stop("The following ", dimC, " name(s) is/are duplicated in the ",
                 tabNamC,
                 " table: '",
                 paste(namVc[dupVl], collapse="', '"), "'",
                 call.=FALSE)
    }

    ## prints the names which are modified by 'make.names'
    ## (dimC: "row" or "column" as displayed in the message;
    ##  idxC: name of the index column in the printed table)
    namDifF <- function(namVc, tabNamC, dimC, idxC) {
        makVc <- make.names(namVc, unique = TRUE)
        ## a missing name is never standard; testing it explicitly avoids an
        ## NA condition in the 'if' below
        difVl <- is.na(namVc) | namVc != makVc
        if(any(difVl)) {
            difDF <- data.frame(seq_along(namVc), namVc, makVc)
            colnames(difDF) <- c(idxC, "actual", "preferred")
            cat("\n\nWarning: The following ", dimC, " names of the ",
                tabNamC,
                " table are not in the standard R format, which may result in errors when loading the data in some of the W4M modules:\n", sep="")
            print(difDF[difVl, , drop = FALSE])
        }
    }

    ## prints the names from 'namVc' which are absent from 'othVc', together
    ## with their position in 'namVc' (names are unique at this stage)
    missingF <- function(namVc, othVc, msgC, idxC) {
        difVc <- setdiff(namVc, othVc)
        if(length(difVc) > 0) {
            cat(msgC, sep="")
            difDF <- data.frame(match(difVc, namVc), difVc)
            colnames(difDF) <- c(idxC, "name")
            print(difDF)
        }
    }

    ## prints side by side the names which differ at the same position
    ## (both vectors must have the same length)
    pairDifF <- function(lefVc, rigVc, colVc) {
        pairDF <- data.frame(seq_along(lefVc), lefVc, rigVc)
        colnames(pairDF) <- colVc
        print(pairDF[lefVc != rigVc, , drop = FALSE])
    }

    ## checking that the tables have no duplicated row or column names

    filLs <- list(dataMatrix = datFilC,
                  sampleMetadata = samFilC,
                  variableMetadata = varFilC)

    for(tabNamC in names(filLs)) {

        filC <- filLs[[tabNamC]]

        ## row names: first column of the table
        rowVc <- read.table(filC,
                            check.names = FALSE,
                            header = TRUE,
                            sep = "\t")[, 1]

        ## column names: first line read as data (so that duplicates are kept
        ## as is), without the top-left cell
        colVc <- unlist(read.table(filC,
                                   check.names = FALSE,
                                   nrows = 1,
                                   sep = "\t"))[-1]

        dupStopF(rowVc, tabNamC, "row")
        dupStopF(colVc, tabNamC, "column")

        namDifF(rowVc, tabNamC, "row", "row")
        namDifF(colVc, tabNamC, "column", "col")

    }

    ## reading tables

    ## the dataMatrix is transposed: samples as rows, variables as columns
    datMN <- t(as.matrix(read.table(datFilC,
                                    check.names = FALSE,
                                    header = TRUE,
                                    row.names = 1,
                                    sep = "\t")))

    samDF <- read.table(samFilC,
                        check.names = FALSE,
                        header = TRUE,
                        row.names = 1,
                        sep = "\t")

    varDF <- read.table(varFilC,
                        check.names = FALSE,
                        header = TRUE,
                        row.names = 1,
                        sep = "\t")

    ## checking formats

    chkL <- TRUE

    datSamVc <- rownames(datMN) ## sample names from the dataMatrix
    datVarVc <- colnames(datMN) ## variable names from the dataMatrix
    samVc <- rownames(samDF)
    varVc <- rownames(varDF)

    if(!identical(datSamVc, samVc)) {
        ## checking sample names

        chkL <- FALSE

        missingF(datSamVc, samVc,
                 "\nThe following samples were found in the dataMatrix column names but not in the sampleMetadata row names:\n",
                 "col")

        missingF(samVc, datSamVc,
                 "\n\nThe following samples were found in the sampleMetadata row names but not in the dataMatrix column names:\n",
                 "row")

        samColVc <- c("indice", "dataMatrix_columnnames", "sampleMetadata_rownames")

        if(nrow(datMN) != nrow(samDF)) {
            cat("\n\nThe dataMatrix has ", nrow(datMN), " columns (ie samples) whereas the sampleMetadata has ", nrow(samDF), " rows\n", sep="")
        } else if(identical(gsub("^X", "", datSamVc), samVc)) {
            cat("\n\nThe dataMatrix column names start with an 'X' but not the sampleMetadata row names\n", sep="")
        } else if(identical(gsub("^X", "", samVc), datSamVc)) {
            cat("\n\nThe sampleMetadata row names start with an 'X' but not the dataMatrix column names\n", sep="")
        } else if(identical(sort(datSamVc), sort(samVc))) {
            cat("\n\nThe dataMatrix column names and the sampleMetadata row names are not in the same order:\n", sep="")
            pairDifF(datSamVc, samVc, samColVc)
        } else {
            cat("\n\nThe dataMatrix column names and the sampleMetadata row names are not identical:\n", sep="")
            pairDifF(datSamVc, samVc, samColVc)
        }

    }

    if(!identical(datVarVc, varVc)) {
        ## checking variable names

        chkL <- FALSE

        missingF(datVarVc, varVc,
                 "\nThe following variables were found in the dataMatrix row names but not in the variableMetadata row names:\n",
                 "row")

        missingF(varVc, datVarVc,
                 "\n\nThe following variables were found in the variableMetadata row names but not in the dataMatrix row names:\n",
                 "row")

        varColVc <- c("row", "dataMatrix_rownames", "variableMetadata_rownames")

        if(ncol(datMN) != nrow(varDF)) {
            ## the variables are the columns of the transposed 'datMN'
            cat("\n\nThe dataMatrix has ", ncol(datMN), " rows (ie variables) whereas the variableMetadata has ", nrow(varDF), " rows\n", sep="")
        } else if(identical(sort(datVarVc), sort(varVc))) {
            cat("\n\nThe dataMatrix row names and the variableMetadata row names are not in the same order:\n", sep="")
            pairDifF(datVarVc, varVc, varColVc)
        } else {
            cat("\n\nThe dataMatrix row names and the variableMetadata row names are not identical:\n", sep="")
            pairDifF(datVarVc, varVc, varColVc)
        }

    }

    resLs <- list(chkL=chkL)

    return(resLs)

} ## end of readAndCheckF
