comparison checkformat_script.R @ 3:80a38d36f946 draft

planemo upload for repository https://github.com/workflow4metabolomics/checkformat.git commit 5cf3f6eb62c1396ade1b068a3dd3cc2e3f827e15
author ethevenot
date Thu, 11 Jan 2018 10:24:56 -0500
parents e194eec8e70c
children 9590fac86f63
comparison
equal deleted inserted replaced
2:b6a6b4cc932a 3:80a38d36f946
1 ## Etienne Thevenot 1 ## Etienne Thevenot
2 ## CEA, MetaboHUB Paris 2 ## CEA, MetaboHUB Paris
3 ## etienne.thevenot@cea.fr 3 ## etienne.thevenot@cea.fr
4
4 5
5 6
6 ## Reads the dataMatrix, sampleMetadata, and variableMetadata .tsv files 7 ## Reads the dataMatrix, sampleMetadata, and variableMetadata .tsv files
7 ## and checks the formats 8 ## and checks the formats
8 readAndCheckF <- function(datFilC="dataMatrix.tsv", 9 readAndCheckF <- function(datFilC="dataMatrix.tsv",
9 samFilC="sampleMetadata.tsv", 10 samFilC="sampleMetadata.tsv",
10 varFilC="variableMetadata.tsv") { 11 varFilC="variableMetadata.tsv",
12 makNamL) {
13
11 14
12 ## options 15 ## options
13 16
14 optStrAsFacL <- options()[["stringsAsFactors"]] 17 optStrAsFacL <- options()[["stringsAsFactors"]]
15 options(stringsAsFactors = FALSE) 18 options(stringsAsFactors = FALSE)
19
16 20
17 ## checking that the tables have no duplicated row or column names 21 ## checking that the tables have no duplicated row or column names
18 22
19 for(tabC in c("dat", "sam", "var")) { 23 for(tabC in c("dat", "sam", "var")) {
20 24
42 tabNamC, 46 tabNamC,
43 " table: '", 47 " table: '",
44 paste(colVc[duplicated(colVc)], collapse="', '"), "'", 48 paste(colVc[duplicated(colVc)], collapse="', '"), "'",
45 call.=FALSE) 49 call.=FALSE)
46 50
47 rowMakVc <- make.names(rowVc, unique = TRUE) 51 }
48 52
49 rowDifVl <- rowVc != rowMakVc
50
51 if(any(rowDifVl)) {
52 rowDifDF <- data.frame(row = 1:length(rowVc),
53 actual = rowVc,
54 preferred = rowMakVc)
55 rowDifDF <- rowDifDF[rowDifVl, , drop = FALSE]
56 cat("\n\nWarning: The following row names of the ",
57 tabNamC,
58 " table are not in the standard R format, which may result in errors when loading the data in some of the W4M modules:\n", sep="")
59 print(rowDifDF)
60 }
61
62 colMakVc <- make.names(colVc, unique = TRUE)
63
64 colDifVl <- colVc != colMakVc
65
66 if(any(colDifVl)) {
67 colDifDF <- data.frame(col = 1:length(colVc),
68 actual = colVc,
69 preferred = colMakVc)
70 colDifDF <- colDifDF[colDifVl, , drop = FALSE]
71 cat("\n\nWarning: The following column names of the ",
72 tabNamC,
73 " table are not in the standard R format, which may result in errors when loading the data in some of the W4M modules:\n", sep="")
74 print(colDifDF)
75 }
76 }
77 53
78 ## reading tables 54 ## reading tables
79 55
80 datMN <- t(as.matrix(read.table(datFilC, 56 datMN <- t(as.matrix(read.table(datFilC,
81 check.names = FALSE, 57 check.names = FALSE,
92 varDF <- read.table(varFilC, 68 varDF <- read.table(varFilC,
93 check.names = FALSE, 69 check.names = FALSE,
94 header = TRUE, 70 header = TRUE,
95 row.names = 1, 71 row.names = 1,
96 sep = "\t") 72 sep = "\t")
97 73
98 ## checking formats 74
75 ## checking that dataMatrix is numeric and that the sample and variable numbers are coherent
76
77 if(mode(datMN) != "numeric") {
78 stop("The dataMatrix is not of the 'numeric' type",
79 call. = FALSE)
80 }
81
82 if(nrow(datMN) != nrow(samDF)) {
83 if(nrow(datMN) > nrow(samDF)) {
84 print(setdiff(rownames(datMN), rownames(samDF)))
85 stop("The sample names above from dataMatrix were not found in sampleMetadata")
86 } else {
87 print(setdiff(rownames(samDF), rownames(datMN)))
88 stop("The sample names above from sampleMetadata were not found in dataMatrix")
89 }
90 }
91
92 if(ncol(datMN) != nrow(varDF)) {
93 if(ncol(datMN) > nrow(varDF)) {
94 print(setdiff(colnames(datMN), rownames(varDF)))
95 stop("The variable names above from dataMatrix were not found in variableMetadata")
96 } else {
97 print(setdiff(rownames(varDF), colnames(datMN)))
98 stop("The variable names above from variableMetadata were not found in dataMatrix")
99 }
100 }
101
102
103 ## making sample and variable names (optional)
104
105 newL <- FALSE
106
107 if(makNamL) {
108
109 cat("\n\nMessage: Converting sample and variable names to the standard R format\n")
110
111 rownames(datMN) <- make.names(rownames(datMN), unique = TRUE)
112 colnames(datMN) <- make.names(colnames(datMN), unique = TRUE)
113 rownames(samDF) <- make.names(rownames(samDF), unique = TRUE)
114 rownames(varDF) <- make.names(rownames(varDF), unique = TRUE)
115
116 newL <- TRUE
117
118 }
119
120
121 ## checking sample and variable names
99 122
100 chkL <- TRUE 123 chkL <- TRUE
101 124
102 if(!identical(rownames(datMN), rownames(samDF))) { 125 if(!identical(rownames(datMN), rownames(samDF))) {
126
127 if(identical(sort(rownames(datMN)), sort(rownames(samDF)))) {
128
129 cat("\n\nMessage: Re-ordering dataMatrix sample names to match sampleMetadata\n")
130 datMN <- datMN[rownames(samDF), , drop = FALSE]
131
132 stopifnot(identical(sort(rownames(datMN)), sort(rownames(samDF))))
133
134 newL <- TRUE
135
136 } else {
137
138 cat("\n\nStop: The sample names of dataMatrix and sampleMetadata do not match:\n")
139 print(cbind.data.frame(indice = 1:nrow(datMN),
140 dataMatrix=rownames(datMN),
141 sampleMetadata=rownames(samDF))[rownames(datMN) != rownames(samDF), , drop = FALSE])
142 chkL <- FALSE
143
144 }
145
146 }
147
148 if(!identical(colnames(datMN), rownames(varDF))) {
149
150 if(identical(sort(colnames(datMN)), sort(rownames(varDF)))) {
151
152 cat("\n\nMessage: Re-ordering dataMatrix variable names to match variableMetadata\n")
153 datMN <- datMN[, rownames(varDF), drop = FALSE]
154
155 stopifnot(identical(sort(colnames(datMN)), sort(rownames(varDF))))
156
157 newL <- TRUE
158
159 } else {
160
161 cat("\n\nStop: The variable names of dataMatrix and variableMetadata do not match:\n")
162 print(cbind.data.frame(indice = 1:ncol(datMN),
163 dataMatrix=colnames(datMN),
164 variableMetadata=rownames(varDF))[colnames(datMN) != rownames(varDF), , drop = FALSE])
165 chkL <- FALSE
166
167 }
168
169 }
170
171
172 options(stringsAsFactors=optStrAsFacL)
173
174 resLs <- list(chkL=chkL,
175 newL = newL,
176 datMN = datMN,
177 samDF = samDF,
178 varDF = varDF)
179
180 return(resLs)
181
182 } ## end of readAndCheckF
183
184
185
186
187 ## if(!identical(rownames(datMN), rownames(samDF))) {
188 ## ## checking sample names
189
190 ## chkL <- FALSE
191
192 ## datSamDifVc <- setdiff(rownames(datMN), rownames(samDF))
193
194 ## if(length(datSamDifVc)) {
195 ## cat("\nThe following samples were found in the dataMatrix column names but not in the sampleMetadata row names:\n", sep="")
196 ## print(cbind.data.frame(col = as.numeric(sapply(datSamDifVc, function(samC) which(rownames(datMN) == samC))),
197 ## name = datSamDifVc))
198 ## }
199
200 ## samDatDifVc <- setdiff(rownames(samDF), rownames(datMN))
201
202 ## if(length(samDatDifVc)) {
203 ## cat("\n\nThe following samples were found in the sampleMetadata row names but not in the dataMatrix column names:\n", sep="")
204 ## print(cbind.data.frame(row = as.numeric(sapply(samDatDifVc, function(samC) which(rownames(samDF) == samC))),
205 ## name = samDatDifVc))
206 ## }
207
208 ## if(nrow(datMN) != nrow(samDF)) {
209 ## cat("\n\nThe dataMatrix has ", nrow(datMN), " columns (ie samples) whereas the sampleMetadata has ", nrow(samDF), " rows\n", sep="")
210 ## } else if(identical(gsub("^X", "", rownames(datMN)), rownames(samDF))) {
211 ## cat("\n\nThe dataMatrix column names start with an 'X' but not the sampleMetadata row names\n", sep="")
212 ## } else if(identical(gsub("^X", "", rownames(samDF)), rownames(datMN))) {
213 ## cat("\n\nThe sampleMetadata row names start with an 'X' but not the dataMatrix column names\n", sep="")
214 ## } else if(identical(sort(rownames(datMN)), sort(rownames(samDF)))) {
215 ## cat("\n\nThe dataMatrix column names and the sampleMetadata row names are not in the same order:\n", sep="")
216 ## print(cbind.data.frame(indice = 1:nrow(datMN),
217 ## dataMatrix_columnnames=rownames(datMN),
218 ## sampleMetadata_rownames=rownames(samDF))[rownames(datMN) != rownames(samDF), , drop = FALSE])
219 ## } else {
220 ## cat("\n\nThe dataMatrix column names and the sampleMetadata row names are not identical:\n", sep="")
221 ## print(cbind.data.frame(indice = 1:nrow(datMN),
222 ## dataMatrix_columnnames=rownames(datMN),
223 ## sampleMetadata_rownames=rownames(samDF))[rownames(datMN) != rownames(samDF), , drop = FALSE])
224 ## }
225
226 ## }
227 ## datRowVc <- rownames(datMN)
228 ## datRowMakVc <- make.names(datRowVc, unique = TRUE)
229 ## if(datRowMakVc != datRowVc) {
230 ## rownames(datMN) <- datRowMakVc
231 ## cat("\n\nMessage: The sample names of the dataMatrix have been converted to the standard R format\n")
232 ## }
233
234 ## datColVc <- colnames(datMN)
235 ## datColMakVc <- make.names(datColVc, unique = TRUE)
236 ## if(datColMakVc != datColVc) {
237 ## colnames(datMN) <- datColMakVc
238 ## cat("\n\nMessage: The variable names of the dataMatrix have been converted to the standard R format\n")
239 ## }
240
241 ## samRowVc <- rownames(datMN)
242 ## samRowMakVc <- make.names(samRowVc, unique = TRUE)
243 ## if(samRowMakVc != samRowVc) {
244 ## rownames(datMN) <- samRowMakVc
245 ## cat("\n\nMessage: The sample names of the sampleMetadata have been converted to the standard R format\n")
246 ## }
247
248 ## datRowVc <- rownames(datMN)
249 ## datRowMakVc <- make.names(datRowVc, unique = TRUE)
250 ## if(datRowMakVc != datRowVc) {
251 ## rownames(datMN) <- datRowMakVc
252 ## cat("\n\nMessage: The sample names of the dataMatrix have been converted to the standard R format\n")
253 ## }
254
255 ## }
256
257 ## checking names (optional)
258
259
260
261
262 ## datRowMakVc <- make.names(datRowVc, unique = TRUE)
263 ## if(datRowMakVc != datRowVc) {
264 ## if(makNamL) {
265 ## rownames(datMN) <- datRowMakVc
266 ## cat("\n\nMessage: The sample names of the dataMatrix have been converted to the standard R format; select the make names argument to convert them\n")
267 ## } else {
268 ## cat("\n\nWarning: Some of the sample names of the dataMatrix are not in the standard R format; select the make names argument to convert them\n")
269 ## }
270
271 ## if(makNamL) {
272
273 ## rownames(datMN) <- make.names(rownames(datMN), unique = TRUE)
274 ## colnames(datMN) <- make.names(colnames(datMN), unique = TRUE)
275 ## rownames(samDF) <- make.names(rownames(samDF), unique = TRUE)
276 ## rownames(varDF) <- make.names(rownames(varDF), unique = TRUE)
277
278 ## }
103 ## checking sample names 279 ## checking sample names
104 280
105 chkL <- FALSE 281 ## if(nrow(datMN) == nrow(samDF)) {
106 282
107 datSamDifVc <- setdiff(rownames(datMN), rownames(samDF)) 283
108 284
109 if(length(datSamDifVc)) { 285 ## }
110 cat("\nThe following samples were found in the dataMatrix column names but not in the sampleMetadata row names:\n", sep="") 286
111 print(cbind.data.frame(col = as.numeric(sapply(datSamDifVc, function(samC) which(rownames(datMN) == samC))), 287 ## chkL <- FALSE
112 name = datSamDifVc)) 288
113 } 289 ## datSamDifVc <- setdiff(rownames(datMN), rownames(samDF))
114 290
115 samDatDifVc <- setdiff(rownames(samDF), rownames(datMN)) 291 ## if(length(datSamDifVc)) {
116 292 ## cat("\nThe following samples were found in the dataMatrix column names but not in the sampleMetadata row names:\n", sep="")
117 if(length(samDatDifVc)) { 293 ## print(cbind.data.frame(col = as.numeric(sapply(datSamDifVc, function(samC) which(rownames(datMN) == samC))),
118 cat("\n\nThe following samples were found in the sampleMetadata row names but not in the dataMatrix column names:\n", sep="") 294 ## name = datSamDifVc))
119 print(cbind.data.frame(row = as.numeric(sapply(samDatDifVc, function(samC) which(rownames(samDF) == samC))), 295 ## }
120 name = samDatDifVc)) 296
121 } 297 ## samDatDifVc <- setdiff(rownames(samDF), rownames(datMN))
122 298
123 if(nrow(datMN) != nrow(samDF)) { 299 ## if(length(samDatDifVc)) {
124 cat("\n\nThe dataMatrix has ", nrow(datMN), " columns (ie samples) whereas the sampleMetadata has ", nrow(samDF), " rows\n", sep="") 300 ## cat("\n\nThe following samples were found in the sampleMetadata row names but not in the dataMatrix column names:\n", sep="")
125 } else if(identical(gsub("^X", "", rownames(datMN)), rownames(samDF))) { 301 ## print(cbind.data.frame(row = as.numeric(sapply(samDatDifVc, function(samC) which(rownames(samDF) == samC))),
126 cat("\n\nThe dataMatrix column names start with an 'X' but not the sampleMetadata row names\n", sep="") 302 ## name = samDatDifVc))
127 } else if(identical(gsub("^X", "", rownames(samDF)), rownames(datMN))) { 303 ## }
128 cat("\n\nThe sampleMetadata row names start with an 'X' but not the dataMatrix column names\n", sep="") 304
129 } else if(identical(sort(rownames(datMN)), sort(rownames(samDF)))) { 305 ## if(nrow(datMN) != nrow(samDF)) {
130 cat("\n\nThe dataMatrix column names and the sampleMetadata row names are not in the same order:\n", sep="") 306 ## cat("\n\nThe dataMatrix has ", nrow(datMN), " columns (ie samples) whereas the sampleMetadata has ", nrow(samDF), " rows\n", sep="")
131 print(cbind.data.frame(indice = 1:nrow(datMN), 307 ## } else if(identical(gsub("^X", "", rownames(datMN)), rownames(samDF))) {
132 dataMatrix_columnnames=rownames(datMN), 308 ## cat("\n\nThe dataMatrix column names start with an 'X' but not the sampleMetadata row names\n", sep="")
133 sampleMetadata_rownames=rownames(samDF))[rownames(datMN) != rownames(samDF), , drop = FALSE]) 309 ## } else if(identical(gsub("^X", "", rownames(samDF)), rownames(datMN))) {
134 } else { 310 ## cat("\n\nThe sampleMetadata row names start with an 'X' but not the dataMatrix column names\n", sep="")
135 cat("\n\nThe dataMatrix column names and the sampleMetadata row names are not identical:\n", sep="") 311 ## } else if(identical(sort(rownames(datMN)), sort(rownames(samDF)))) {
136 print(cbind.data.frame(indice = 1:nrow(datMN), 312 ## cat("\n\nThe dataMatrix column names and the sampleMetadata row names are not in the same order:\n", sep="")
137 dataMatrix_columnnames=rownames(datMN), 313 ## print(cbind.data.frame(indice = 1:nrow(datMN),
138 sampleMetadata_rownames=rownames(samDF))[rownames(datMN) != rownames(samDF), , drop = FALSE]) 314 ## dataMatrix_columnnames=rownames(datMN),
139 } 315 ## sampleMetadata_rownames=rownames(samDF))[rownames(datMN) != rownames(samDF), , drop = FALSE])
140 316 ## } else {
141 } 317 ## cat("\n\nThe dataMatrix column names and the sampleMetadata row names are not identical:\n", sep="")
142 318 ## print(cbind.data.frame(indice = 1:nrow(datMN),
143 if(!identical(colnames(datMN), rownames(varDF))) { 319 ## dataMatrix_columnnames=rownames(datMN),
144 ## checking variable names 320 ## sampleMetadata_rownames=rownames(samDF))[rownames(datMN) != rownames(samDF), , drop = FALSE])
145 321 ## }
146 chkL <- FALSE 322
147 323 ## }
148 datVarDifVc <- setdiff(colnames(datMN), rownames(varDF)) 324
149 325
150 if(length(datVarDifVc)) { 326 ## if(!identical(colnames(datMN), rownames(varDF))) {
151 cat("\nThe following variables were found in the dataMatrix row names but not in the variableMetadata row names:\n", sep="") 327 ## ## checking variable names
152 print(cbind.data.frame(row = as.numeric(sapply(datVarDifVc, function(varC) which(colnames(datMN) == varC))), 328
153 name = datVarDifVc)) 329 ## chkL <- FALSE
154 330
155 } 331 ## datVarDifVc <- setdiff(colnames(datMN), rownames(varDF))
156 332
157 varDatDifVc <- setdiff(rownames(varDF), colnames(datMN)) 333 ## if(length(datVarDifVc)) {
158 334 ## cat("\nThe following variables were found in the dataMatrix row names but not in the variableMetadata row names:\n", sep="")
159 if(length(varDatDifVc)) { 335 ## print(cbind.data.frame(row = as.numeric(sapply(datVarDifVc, function(varC) which(colnames(datMN) == varC))),
160 cat("\n\nThe following variables were found in the variableMetadata row names but not in the dataMatrix row names:\n", sep="") 336 ## name = datVarDifVc))
161 print(cbind.data.frame(row = as.numeric(sapply(varDatDifVc, function(varC) which(rownames(varDF) == varC))), 337
162 name = varDatDifVc)) 338 ## }
163 } 339
164 340 ## varDatDifVc <- setdiff(rownames(varDF), colnames(datMN))
165 if(ncol(datMN) != nrow(varDF)) { 341
166 cat("\n\nThe dataMatrix has ", nrow(datMN), " rows (ie variables) whereas the variableMetadata has ", nrow(varDF), " rows\n", sep="") 342 ## if(length(varDatDifVc)) {
167 } else if(identical(sort(colnames(datMN)), sort(rownames(varDF)))) { 343 ## cat("\n\nThe following variables were found in the variableMetadata row names but not in the dataMatrix row names:\n", sep="")
168 cat("\n\nThe dataMatrix row names and the variableMetadata row names are not in the same order:\n", sep="") 344 ## print(cbind.data.frame(row = as.numeric(sapply(varDatDifVc, function(varC) which(rownames(varDF) == varC))),
169 print(cbind.data.frame(row = 1:ncol(datMN), 345 ## name = varDatDifVc))
170 dataMatrix_rownames=colnames(datMN), 346 ## }
171 variableMetadata_rownames=rownames(varDF))[colnames(datMN) != rownames(varDF), , drop = FALSE]) 347
172 } else { 348 ## if(ncol(datMN) != nrow(varDF)) {
173 cat("\n\nThe dataMatrix row names and the variableMetadata row names are not identical:\n", sep="") 349 ## cat("\n\nThe dataMatrix has ", nrow(datMN), " rows (ie variables) whereas the variableMetadata has ", nrow(varDF), " rows\n", sep="")
174 print(cbind.data.frame(row = 1:ncol(datMN), 350 ## } else if(identical(sort(colnames(datMN)), sort(rownames(varDF)))) {
175 dataMatrix_rownames=colnames(datMN), 351 ## cat("\n\nThe dataMatrix row names and the variableMetadata row names are not in the same order:\n", sep="")
176 variableMetadata_rownames=rownames(varDF))[colnames(datMN) != rownames(varDF), , drop = FALSE]) 352 ## print(cbind.data.frame(row = 1:ncol(datMN),
177 } 353 ## dataMatrix_rownames=colnames(datMN),
178 } 354 ## variableMetadata_rownames=rownames(varDF))[colnames(datMN) != rownames(varDF), , drop = FALSE])
179 355 ## } else {
180 options(stringsAsFactors=optStrAsFacL) 356 ## cat("\n\nThe dataMatrix row names and the variableMetadata row names are not identical:\n", sep="")
181 357 ## print(cbind.data.frame(row = 1:ncol(datMN),
182 resLs <- list(chkL=chkL) 358 ## dataMatrix_rownames=colnames(datMN),
183 359 ## variableMetadata_rownames=rownames(varDF))[colnames(datMN) != rownames(varDF), , drop = FALSE])
184 return(resLs) 360 ## }
185 361 ## }
186 } ## end of readAndCheckF 362 ## checkF <- function(datInpMN,
363 ## samInpDF,
364 ## varInpDF) {
365
366 ## mode(datInpMN) == "numeric" &&
367 ## identical(rownames(datInpMN), rownames(samInpDF)) &&
368 ## identical(colnames(datInpMN), rownames(varInpDF))
369
370
371 ## }