Mercurial > repos > ethevenot > checkformat
comparison checkformat_script.R @ 3:80a38d36f946 draft
planemo upload for repository https://github.com/workflow4metabolomics/checkformat.git commit 5cf3f6eb62c1396ade1b068a3dd3cc2e3f827e15
author | ethevenot |
---|---|
date | Thu, 11 Jan 2018 10:24:56 -0500 |
parents | e194eec8e70c |
children | 9590fac86f63 |
comparison
equal
deleted
inserted
replaced
2:b6a6b4cc932a | 3:80a38d36f946 |
---|---|
1 ## Etienne Thevenot | 1 ## Etienne Thevenot |
2 ## CEA, MetaboHUB Paris | 2 ## CEA, MetaboHUB Paris |
3 ## etienne.thevenot@cea.fr | 3 ## etienne.thevenot@cea.fr |
4 | |
4 | 5 |
5 | 6 |
6 ## Reads the dataMatrix, sampleMetadata, and variableMetadata .tsv files | 7 ## Reads the dataMatrix, sampleMetadata, and variableMetadata .tsv files |
7 ## and checks the formats | 8 ## and checks the formats |
8 readAndCheckF <- function(datFilC="dataMatrix.tsv", | 9 readAndCheckF <- function(datFilC="dataMatrix.tsv", |
9 samFilC="sampleMetadata.tsv", | 10 samFilC="sampleMetadata.tsv", |
10 varFilC="variableMetadata.tsv") { | 11 varFilC="variableMetadata.tsv", |
12 makNamL) { | |
13 | |
11 | 14 |
12 ## options | 15 ## options |
13 | 16 |
14 optStrAsFacL <- options()[["stringsAsFactors"]] | 17 optStrAsFacL <- options()[["stringsAsFactors"]] |
15 options(stringsAsFactors = FALSE) | 18 options(stringsAsFactors = FALSE) |
19 | |
16 | 20 |
17 ## checking that the tables have no duplicated row or column names | 21 ## checking that the tables have no duplicated row or column names |
18 | 22 |
19 for(tabC in c("dat", "sam", "var")) { | 23 for(tabC in c("dat", "sam", "var")) { |
20 | 24 |
42 tabNamC, | 46 tabNamC, |
43 " table: '", | 47 " table: '", |
44 paste(colVc[duplicated(colVc)], collapse="', '"), "'", | 48 paste(colVc[duplicated(colVc)], collapse="', '"), "'", |
45 call.=FALSE) | 49 call.=FALSE) |
46 | 50 |
47 rowMakVc <- make.names(rowVc, unique = TRUE) | 51 } |
48 | 52 |
49 rowDifVl <- rowVc != rowMakVc | |
50 | |
51 if(any(rowDifVl)) { | |
52 rowDifDF <- data.frame(row = 1:length(rowVc), | |
53 actual = rowVc, | |
54 preferred = rowMakVc) | |
55 rowDifDF <- rowDifDF[rowDifVl, , drop = FALSE] | |
56 cat("\n\nWarning: The following row names of the ", | |
57 tabNamC, | |
58 " table are not in the standard R format, which may result in errors when loading the data in some of the W4M modules:\n", sep="") | |
59 print(rowDifDF) | |
60 } | |
61 | |
62 colMakVc <- make.names(colVc, unique = TRUE) | |
63 | |
64 colDifVl <- colVc != colMakVc | |
65 | |
66 if(any(colDifVl)) { | |
67 colDifDF <- data.frame(col = 1:length(colVc), | |
68 actual = colVc, | |
69 preferred = colMakVc) | |
70 colDifDF <- colDifDF[colDifVl, , drop = FALSE] | |
71 cat("\n\nWarning: The following column names of the ", | |
72 tabNamC, | |
73 " table are not in the standard R format, which may result in errors when loading the data in some of the W4M modules:\n", sep="") | |
74 print(colDifDF) | |
75 } | |
76 } | |
77 | 53 |
78 ## reading tables | 54 ## reading tables |
79 | 55 |
80 datMN <- t(as.matrix(read.table(datFilC, | 56 datMN <- t(as.matrix(read.table(datFilC, |
81 check.names = FALSE, | 57 check.names = FALSE, |
92 varDF <- read.table(varFilC, | 68 varDF <- read.table(varFilC, |
93 check.names = FALSE, | 69 check.names = FALSE, |
94 header = TRUE, | 70 header = TRUE, |
95 row.names = 1, | 71 row.names = 1, |
96 sep = "\t") | 72 sep = "\t") |
97 | 73 |
98 ## checking formats | 74 |
75 ## checking that dataMatrix is numeric and that the numbers of samples and variables are consistent across the tables | |
76 | |
77 if(mode(datMN) != "numeric") { | |
78 stop("The dataMatrix is not of the 'numeric' type", | |
79 call. = FALSE) | |
80 } | |
81 | |
82 if(nrow(datMN) != nrow(samDF)) { | |
83 if(nrow(datMN) > nrow(samDF)) { | |
84 print(setdiff(rownames(datMN), rownames(samDF))) | |
85 stop("The sample names above from dataMatrix were not found in sampleMetadata") | |
86 } else { | |
87 print(setdiff(rownames(samDF), rownames(datMN))) | |
88 stop("The sample names above from sampleMetadata were not found in dataMatrix") | |
89 } | |
90 } | |
91 | |
92 if(ncol(datMN) != nrow(varDF)) { | |
93 if(ncol(datMN) > nrow(varDF)) { | |
94 print(setdiff(colnames(datMN), rownames(varDF))) | |
95 stop("The variable names above from dataMatrix were not found in variableMetadata") | |
96 } else { | |
97 print(setdiff(rownames(varDF), colnames(datMN))) | |
98 stop("The variable names above from variableMetadata were not found in dataMatrix") | |
99 } | |
100 } | |
101 | |
102 | |
103 ## converting sample and variable names to syntactically valid R names with make.names (optional) | |
104 | |
105 newL <- FALSE | |
106 | |
107 if(makNamL) { | |
108 | |
109 cat("\n\nMessage: Converting sample and variable names to the standard R format\n") | |
110 | |
111 rownames(datMN) <- make.names(rownames(datMN), unique = TRUE) | |
112 colnames(datMN) <- make.names(colnames(datMN), unique = TRUE) | |
113 rownames(samDF) <- make.names(rownames(samDF), unique = TRUE) | |
114 rownames(varDF) <- make.names(rownames(varDF), unique = TRUE) | |
115 | |
116 newL <- TRUE | |
117 | |
118 } | |
119 | |
120 | |
121 ## checking sample and variable names | |
99 | 122 |
100 chkL <- TRUE | 123 chkL <- TRUE |
101 | 124 |
102 if(!identical(rownames(datMN), rownames(samDF))) { | 125 if(!identical(rownames(datMN), rownames(samDF))) { |
126 | |
127 if(identical(sort(rownames(datMN)), sort(rownames(samDF)))) { | |
128 | |
129 cat("\n\nMessage: Re-ordering dataMatrix sample names to match sampleMetadata\n") | |
130 datMN <- datMN[rownames(samDF), , drop = FALSE] | |
131 | |
132 stopifnot(identical(sort(rownames(datMN)), sort(rownames(samDF)))) | |
133 | |
134 newL <- TRUE | |
135 | |
136 } else { | |
137 | |
138 cat("\n\nStop: The sample names of dataMatrix and sampleMetadata do not match:\n") | |
139 print(cbind.data.frame(indice = 1:nrow(datMN), | |
140 dataMatrix=rownames(datMN), | |
141 sampleMetadata=rownames(samDF))[rownames(datMN) != rownames(samDF), , drop = FALSE]) | |
142 chkL <- FALSE | |
143 | |
144 } | |
145 | |
146 } | |
147 | |
148 if(!identical(colnames(datMN), rownames(varDF))) { | |
149 | |
150 if(identical(sort(colnames(datMN)), sort(rownames(varDF)))) { | |
151 | |
152 cat("\n\nMessage: Re-ordering dataMatrix variable names to match variableMetadata\n") | |
153 datMN <- datMN[, rownames(varDF), drop = FALSE] | |
154 | |
155 stopifnot(identical(sort(colnames(datMN)), sort(rownames(varDF)))) | |
156 | |
157 newL <- TRUE | |
158 | |
159 } else { | |
160 | |
161 cat("\n\nStop: The variable names of dataMatrix and variableMetadata do not match:\n") | |
162 print(cbind.data.frame(indice = 1:ncol(datMN), | |
163 dataMatrix=colnames(datMN), | |
164 variableMetadata=rownames(varDF))[colnames(datMN) != rownames(varDF), , drop = FALSE]) | |
165 chkL <- FALSE | |
166 | |
167 } | |
168 | |
169 } | |
170 | |
171 | |
172 options(stringsAsFactors=optStrAsFacL) | |
173 | |
174 resLs <- list(chkL=chkL, | |
175 newL = newL, | |
176 datMN = datMN, | |
177 samDF = samDF, | |
178 varDF = varDF) | |
179 | |
180 return(resLs) | |
181 | |
182 } ## end of readAndCheckF | |
183 | |
184 | |
185 | |
186 | |
187 ## if(!identical(rownames(datMN), rownames(samDF))) { | |
188 ## ## checking sample names | |
189 | |
190 ## chkL <- FALSE | |
191 | |
192 ## datSamDifVc <- setdiff(rownames(datMN), rownames(samDF)) | |
193 | |
194 ## if(length(datSamDifVc)) { | |
195 ## cat("\nThe following samples were found in the dataMatrix column names but not in the sampleMetadata row names:\n", sep="") | |
196 ## print(cbind.data.frame(col = as.numeric(sapply(datSamDifVc, function(samC) which(rownames(datMN) == samC))), | |
197 ## name = datSamDifVc)) | |
198 ## } | |
199 | |
200 ## samDatDifVc <- setdiff(rownames(samDF), rownames(datMN)) | |
201 | |
202 ## if(length(samDatDifVc)) { | |
203 ## cat("\n\nThe following samples were found in the sampleMetadata row names but not in the dataMatrix column names:\n", sep="") | |
204 ## print(cbind.data.frame(row = as.numeric(sapply(samDatDifVc, function(samC) which(rownames(samDF) == samC))), | |
205 ## name = samDatDifVc)) | |
206 ## } | |
207 | |
208 ## if(nrow(datMN) != nrow(samDF)) { | |
209 ## cat("\n\nThe dataMatrix has ", nrow(datMN), " columns (ie samples) whereas the sampleMetadata has ", nrow(samDF), " rows\n", sep="") | |
210 ## } else if(identical(gsub("^X", "", rownames(datMN)), rownames(samDF))) { | |
211 ## cat("\n\nThe dataMatrix column names start with an 'X' but not the sampleMetadata row names\n", sep="") | |
212 ## } else if(identical(gsub("^X", "", rownames(samDF)), rownames(datMN))) { | |
213 ## cat("\n\nThe sampleMetadata row names start with an 'X' but not the dataMatrix column names\n", sep="") | |
214 ## } else if(identical(sort(rownames(datMN)), sort(rownames(samDF)))) { | |
215 ## cat("\n\nThe dataMatrix column names and the sampleMetadata row names are not in the same order:\n", sep="") | |
216 ## print(cbind.data.frame(indice = 1:nrow(datMN), | |
217 ## dataMatrix_columnnames=rownames(datMN), | |
218 ## sampleMetadata_rownames=rownames(samDF))[rownames(datMN) != rownames(samDF), , drop = FALSE]) | |
219 ## } else { | |
220 ## cat("\n\nThe dataMatrix column names and the sampleMetadata row names are not identical:\n", sep="") | |
221 ## print(cbind.data.frame(indice = 1:nrow(datMN), | |
222 ## dataMatrix_columnnames=rownames(datMN), | |
223 ## sampleMetadata_rownames=rownames(samDF))[rownames(datMN) != rownames(samDF), , drop = FALSE]) | |
224 ## } | |
225 | |
226 ## } | |
227 ## datRowVc <- rownames(datMN) | |
228 ## datRowMakVc <- make.names(datRowVc, unique = TRUE) | |
229 ## if(datRowMakVc != datRowVc) { | |
230 ## rownames(datMN) <- datRowMakVc | |
231 ## cat("\n\nMessage: The sample names of the dataMatrix have been converted to the standard R format\n") | |
232 ## } | |
233 | |
234 ## datColVc <- colnames(datMN) | |
235 ## datColMakVc <- make.names(datColVc, unique = TRUE) | |
236 ## if(datColMakVc != datColVc) { | |
237 ## colnames(datMN) <- datColMakVc | |
238 ## cat("\n\nMessage: The variable names of the dataMatrix have been converted to the standard R format\n") | |
239 ## } | |
240 | |
241 ## samRowVc <- rownames(datMN) | |
242 ## samRowMakVc <- make.names(samRowVc, unique = TRUE) | |
243 ## if(samRowMakVc != samRowVc) { | |
244 ## rownames(datMN) <- samRowMakVc | |
245 ## cat("\n\nMessage: The sample names of the sampleMetadata have been converted to the standard R format\n") | |
246 ## } | |
247 | |
248 ## datRowVc <- rownames(datMN) | |
249 ## datRowMakVc <- make.names(datRowVc, unique = TRUE) | |
250 ## if(datRowMakVc != datRowVc) { | |
251 ## rownames(datMN) <- datRowMakVc | |
252 ## cat("\n\nMessage: The sample names of the dataMatrix have been converted to the standard R format\n") | |
253 ## } | |
254 | |
255 ## } | |
256 | |
257 ## checking names (optional) | |
258 | |
259 | |
260 | |
261 | |
262 ## datRowMakVc <- make.names(datRowVc, unique = TRUE) | |
263 ## if(datRowMakVc != datRowVc) { | |
264 ## if(makNamL) { | |
265 ## rownames(datMN) <- datRowMakVc | |
266 ## cat("\n\nMessage: The sample names of the dataMatrix have been converted to the standard R format; select the make names argument to convert them\n") | |
267 ## } else { | |
268 ## cat("\n\nWarning: Some of the sample names of the dataMatrix are not in the standard R format; select the make names argument to convert them\n") | |
269 ## } | |
270 | |
271 ## if(makNamL) { | |
272 | |
273 ## rownames(datMN) <- make.names(rownames(datMN), unique = TRUE) | |
274 ## colnames(datMN) <- make.names(colnames(datMN), unique = TRUE) | |
275 ## rownames(samDF) <- make.names(rownames(samDF), unique = TRUE) | |
276 ## rownames(varDF) <- make.names(rownames(varDF), unique = TRUE) | |
277 | |
278 ## } | |
103 ## checking sample names | 279 ## checking sample names |
104 | 280 |
105 chkL <- FALSE | 281 ## if(nrow(datMN) == nrow(samDF)) { |
106 | 282 |
107 datSamDifVc <- setdiff(rownames(datMN), rownames(samDF)) | 283 |
108 | 284 |
109 if(length(datSamDifVc)) { | 285 ## } |
110 cat("\nThe following samples were found in the dataMatrix column names but not in the sampleMetadata row names:\n", sep="") | 286 |
111 print(cbind.data.frame(col = as.numeric(sapply(datSamDifVc, function(samC) which(rownames(datMN) == samC))), | 287 ## chkL <- FALSE |
112 name = datSamDifVc)) | 288 |
113 } | 289 ## datSamDifVc <- setdiff(rownames(datMN), rownames(samDF)) |
114 | 290 |
115 samDatDifVc <- setdiff(rownames(samDF), rownames(datMN)) | 291 ## if(length(datSamDifVc)) { |
116 | 292 ## cat("\nThe following samples were found in the dataMatrix column names but not in the sampleMetadata row names:\n", sep="") |
117 if(length(samDatDifVc)) { | 293 ## print(cbind.data.frame(col = as.numeric(sapply(datSamDifVc, function(samC) which(rownames(datMN) == samC))), |
118 cat("\n\nThe following samples were found in the sampleMetadata row names but not in the dataMatrix column names:\n", sep="") | 294 ## name = datSamDifVc)) |
119 print(cbind.data.frame(row = as.numeric(sapply(samDatDifVc, function(samC) which(rownames(samDF) == samC))), | 295 ## } |
120 name = samDatDifVc)) | 296 |
121 } | 297 ## samDatDifVc <- setdiff(rownames(samDF), rownames(datMN)) |
122 | 298 |
123 if(nrow(datMN) != nrow(samDF)) { | 299 ## if(length(samDatDifVc)) { |
124 cat("\n\nThe dataMatrix has ", nrow(datMN), " columns (ie samples) whereas the sampleMetadata has ", nrow(samDF), " rows\n", sep="") | 300 ## cat("\n\nThe following samples were found in the sampleMetadata row names but not in the dataMatrix column names:\n", sep="") |
125 } else if(identical(gsub("^X", "", rownames(datMN)), rownames(samDF))) { | 301 ## print(cbind.data.frame(row = as.numeric(sapply(samDatDifVc, function(samC) which(rownames(samDF) == samC))), |
126 cat("\n\nThe dataMatrix column names start with an 'X' but not the sampleMetadata row names\n", sep="") | 302 ## name = samDatDifVc)) |
127 } else if(identical(gsub("^X", "", rownames(samDF)), rownames(datMN))) { | 303 ## } |
128 cat("\n\nThe sampleMetadata row names start with an 'X' but not the dataMatrix column names\n", sep="") | 304 |
129 } else if(identical(sort(rownames(datMN)), sort(rownames(samDF)))) { | 305 ## if(nrow(datMN) != nrow(samDF)) { |
130 cat("\n\nThe dataMatrix column names and the sampleMetadata row names are not in the same order:\n", sep="") | 306 ## cat("\n\nThe dataMatrix has ", nrow(datMN), " columns (ie samples) whereas the sampleMetadata has ", nrow(samDF), " rows\n", sep="") |
131 print(cbind.data.frame(indice = 1:nrow(datMN), | 307 ## } else if(identical(gsub("^X", "", rownames(datMN)), rownames(samDF))) { |
132 dataMatrix_columnnames=rownames(datMN), | 308 ## cat("\n\nThe dataMatrix column names start with an 'X' but not the sampleMetadata row names\n", sep="") |
133 sampleMetadata_rownames=rownames(samDF))[rownames(datMN) != rownames(samDF), , drop = FALSE]) | 309 ## } else if(identical(gsub("^X", "", rownames(samDF)), rownames(datMN))) { |
134 } else { | 310 ## cat("\n\nThe sampleMetadata row names start with an 'X' but not the dataMatrix column names\n", sep="") |
135 cat("\n\nThe dataMatrix column names and the sampleMetadata row names are not identical:\n", sep="") | 311 ## } else if(identical(sort(rownames(datMN)), sort(rownames(samDF)))) { |
136 print(cbind.data.frame(indice = 1:nrow(datMN), | 312 ## cat("\n\nThe dataMatrix column names and the sampleMetadata row names are not in the same order:\n", sep="") |
137 dataMatrix_columnnames=rownames(datMN), | 313 ## print(cbind.data.frame(indice = 1:nrow(datMN), |
138 sampleMetadata_rownames=rownames(samDF))[rownames(datMN) != rownames(samDF), , drop = FALSE]) | 314 ## dataMatrix_columnnames=rownames(datMN), |
139 } | 315 ## sampleMetadata_rownames=rownames(samDF))[rownames(datMN) != rownames(samDF), , drop = FALSE]) |
140 | 316 ## } else { |
141 } | 317 ## cat("\n\nThe dataMatrix column names and the sampleMetadata row names are not identical:\n", sep="") |
142 | 318 ## print(cbind.data.frame(indice = 1:nrow(datMN), |
143 if(!identical(colnames(datMN), rownames(varDF))) { | 319 ## dataMatrix_columnnames=rownames(datMN), |
144 ## checking variable names | 320 ## sampleMetadata_rownames=rownames(samDF))[rownames(datMN) != rownames(samDF), , drop = FALSE]) |
145 | 321 ## } |
146 chkL <- FALSE | 322 |
147 | 323 ## } |
148 datVarDifVc <- setdiff(colnames(datMN), rownames(varDF)) | 324 |
149 | 325 |
150 if(length(datVarDifVc)) { | 326 ## if(!identical(colnames(datMN), rownames(varDF))) { |
151 cat("\nThe following variables were found in the dataMatrix row names but not in the variableMetadata row names:\n", sep="") | 327 ## ## checking variable names |
152 print(cbind.data.frame(row = as.numeric(sapply(datVarDifVc, function(varC) which(colnames(datMN) == varC))), | 328 |
153 name = datVarDifVc)) | 329 ## chkL <- FALSE |
154 | 330 |
155 } | 331 ## datVarDifVc <- setdiff(colnames(datMN), rownames(varDF)) |
156 | 332 |
157 varDatDifVc <- setdiff(rownames(varDF), colnames(datMN)) | 333 ## if(length(datVarDifVc)) { |
158 | 334 ## cat("\nThe following variables were found in the dataMatrix row names but not in the variableMetadata row names:\n", sep="") |
159 if(length(varDatDifVc)) { | 335 ## print(cbind.data.frame(row = as.numeric(sapply(datVarDifVc, function(varC) which(colnames(datMN) == varC))), |
160 cat("\n\nThe following variables were found in the variableMetadata row names but not in the dataMatrix row names:\n", sep="") | 336 ## name = datVarDifVc)) |
161 print(cbind.data.frame(row = as.numeric(sapply(varDatDifVc, function(varC) which(rownames(varDF) == varC))), | 337 |
162 name = varDatDifVc)) | 338 ## } |
163 } | 339 |
164 | 340 ## varDatDifVc <- setdiff(rownames(varDF), colnames(datMN)) |
165 if(ncol(datMN) != nrow(varDF)) { | 341 |
166 cat("\n\nThe dataMatrix has ", nrow(datMN), " rows (ie variables) whereas the variableMetadata has ", nrow(varDF), " rows\n", sep="") | 342 ## if(length(varDatDifVc)) { |
167 } else if(identical(sort(colnames(datMN)), sort(rownames(varDF)))) { | 343 ## cat("\n\nThe following variables were found in the variableMetadata row names but not in the dataMatrix row names:\n", sep="") |
168 cat("\n\nThe dataMatrix row names and the variableMetadata row names are not in the same order:\n", sep="") | 344 ## print(cbind.data.frame(row = as.numeric(sapply(varDatDifVc, function(varC) which(rownames(varDF) == varC))), |
169 print(cbind.data.frame(row = 1:ncol(datMN), | 345 ## name = varDatDifVc)) |
170 dataMatrix_rownames=colnames(datMN), | 346 ## } |
171 variableMetadata_rownames=rownames(varDF))[colnames(datMN) != rownames(varDF), , drop = FALSE]) | 347 |
172 } else { | 348 ## if(ncol(datMN) != nrow(varDF)) { |
173 cat("\n\nThe dataMatrix row names and the variableMetadata row names are not identical:\n", sep="") | 349 ## cat("\n\nThe dataMatrix has ", nrow(datMN), " rows (ie variables) whereas the variableMetadata has ", nrow(varDF), " rows\n", sep="") |
174 print(cbind.data.frame(row = 1:ncol(datMN), | 350 ## } else if(identical(sort(colnames(datMN)), sort(rownames(varDF)))) { |
175 dataMatrix_rownames=colnames(datMN), | 351 ## cat("\n\nThe dataMatrix row names and the variableMetadata row names are not in the same order:\n", sep="") |
176 variableMetadata_rownames=rownames(varDF))[colnames(datMN) != rownames(varDF), , drop = FALSE]) | 352 ## print(cbind.data.frame(row = 1:ncol(datMN), |
177 } | 353 ## dataMatrix_rownames=colnames(datMN), |
178 } | 354 ## variableMetadata_rownames=rownames(varDF))[colnames(datMN) != rownames(varDF), , drop = FALSE]) |
179 | 355 ## } else { |
180 options(stringsAsFactors=optStrAsFacL) | 356 ## cat("\n\nThe dataMatrix row names and the variableMetadata row names are not identical:\n", sep="") |
181 | 357 ## print(cbind.data.frame(row = 1:ncol(datMN), |
182 resLs <- list(chkL=chkL) | 358 ## dataMatrix_rownames=colnames(datMN), |
183 | 359 ## variableMetadata_rownames=rownames(varDF))[colnames(datMN) != rownames(varDF), , drop = FALSE]) |
184 return(resLs) | 360 ## } |
185 | 361 ## } |
186 } ## end of readAndCheckF | 362 ## checkF <- function(datInpMN, |
363 ## samInpDF, | |
364 ## varInpDF) { | |
365 | |
366 ## mode(datInpMN) == "numeric" && | |
367 ## identical(rownames(datInpMN), rownames(samInpDF)) && | |
368 ## identical(colnames(datInpMN), rownames(varInpDF)) | |
369 | |
370 | |
371 ## } |