Mercurial > repos > george-weingart > maaslin
comparison maaslin-4450aa4ecc84/src/lib/IO.R @ 1:a87d5a5f2776
Uploaded the version running on the prod server
author | george-weingart |
---|---|
date | Sun, 08 Feb 2015 23:08:38 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:e0b5980139d9 | 1:a87d5a5f2776 |
---|---|
1 ##################################################################################### | |
2 #Copyright (C) <2012> | |
3 # | |
4 #Permission is hereby granted, free of charge, to any person obtaining a copy of | |
5 #this software and associated documentation files (the "Software"), to deal in the | |
6 #Software without restriction, including without limitation the rights to use, copy, | |
7 #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, | |
8 #and to permit persons to whom the Software is furnished to do so, subject to | |
9 #the following conditions: | |
10 # | |
11 #The above copyright notice and this permission notice shall be included in all copies | |
12 #or substantial portions of the Software. | |
13 # | |
14 #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, | |
15 #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A | |
16 #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT | |
17 #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION | |
18 #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | |
19 #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |
20 # | |
21 # This file is a component of the MaAsLin (Multivariate Associations Using Linear Models), | |
22 # authored by the Huttenhower lab at the Harvard School of Public Health | |
23 # (contact Timothy Tickle, ttickle@hsph.harvard.edu). | |
24 ##################################################################################### | |
25 | |
26 inlinedocs <- function( | |
27 ##author<< Curtis Huttenhower <chuttenh@hsph.harvard.edu> and Timothy Tickle <ttickle@hsph.harvard.edu> | |
28 ##description<< Collection of functions centered on custom reading of data and some IO services. | |
29 ) { return( pArgs ) } | |
30 | |
31 #Project Constants | |
32 | |
33 c_astrNA <- c(""," "," ","NA","na") | |
34 | |
35 #Do not report warnings | |
36 options(warn=-1) | |
37 | |
38 funcWriteMatrixToReadConfigFile = function( | |
39 ### Writes a read config file. Will write over a file by default | |
40 strConfigureFileName, | |
41 ### Matrix that will be read | |
42 strMatrixName, | |
43 ### Name of matrix that will be read | |
44 strRowIndices=NA, | |
45 ### Rows which will be Read (TSV) by default all will be read | |
46 strColIndices=NA, | |
47 ### Cols which will be Read (TSV) by default all will be read | |
48 acharDelimiter=c_strDefaultMatrixDelimiter, | |
49 ### Delimiter for the matrix that will be read in\ | |
50 fAppend=FALSE | |
51 ### Append to a current read config file | |
52 ){ | |
53 #If no append delete previous file | |
54 if(!fAppend){unlink(strConfigureFileName)} | |
55 | |
56 #Make delimiter readable | |
57 switch(acharDelimiter, | |
58 "\t" = {acharDelimiter = "TAB"}, | |
59 " " = {acharDelimiter = "SPACE"}, | |
60 "\r" = {acharDelimiter = "RETURN"}, | |
61 "\n" = {acharDelimiter = "ENDLINE"}) | |
62 | |
63 #Manage NAs | |
64 if(is.na(strRowIndices)){strRowIndices="-"} | |
65 if(is.na(strColIndices)){strColIndices="-"} | |
66 | |
67 #Required output | |
68 lsDataLines = c(paste(c_MATRIX_NAME,strMatrixName,sep=" "), | |
69 paste(c_DELIMITER,acharDelimiter,sep=" "), | |
70 paste(c_ID_ROW,"1",sep=" "), | |
71 paste(c_ID_COLUMN,"1",sep=" "), | |
72 paste(c_TSVROWS,strRowIndices,sep=" "), | |
73 paste(c_TSVCOLUMNS,strColIndices,sep=" ")) | |
74 | |
75 lsDataLines = c(lsDataLines,"\n") | |
76 | |
77 #Output to file | |
78 lapply(lsDataLines, cat, file=strConfigureFileName, sep="\n", append=TRUE) | |
79 } | |
80 | |
81 funcWriteMatrices = function( | |
82 ### Write data frame data files with config files | |
83 dataFrameList, | |
84 ### A named list of data frames (what you get directly from the read function) | |
85 saveFileList, | |
86 ### File names to save the data matrices in (one name per data frame) | |
87 configureFileName, | |
88 ### Name of the configure file to be written which will direct the reading of these data | |
89 acharDelimiter=c_strDefaultMatrixDelimiter, | |
90 ### Matrix delimiter | |
91 log = FALSE | |
92 ### Indicates if logging should occur | |
93 ){ | |
94 #Get names | |
95 dataFrameNames = names(dataFrameList) | |
96 | |
97 #Get length of dataFrameList | |
98 dataFrameListLength = length(dataFrameList) | |
99 | |
100 #Get length of save file list | |
101 saveFileListLength = length(saveFileList) | |
102 | |
103 #If the save file list length and data frame list length are not equal, abort | |
104 if(!saveFileListLength == dataFrameListLength) | |
105 {stop(paste("Received a length of save files (",saveFileListLength,") that are different from the count of data frames (",dataFrameListLength,"). Stopped and returned false."),sep="")} | |
106 | |
107 #Delete the old config file | |
108 unlink(configureFileName) | |
109 | |
110 #For each data save | |
111 for (dataIndex in c(1:dataFrameListLength)) | |
112 { | |
113 #Current data frame | |
114 data = dataFrameList[[dataIndex]] | |
115 | |
116 #Get column count | |
117 columnCount = ncol(data) | |
118 | |
119 #Get row and column names | |
120 rowNames = row.names(data) | |
121 rowNamesString = paste(rowNames,sep="",collapse=",") | |
122 if(length(rowNamesString)==0){rowNamesString = NA} | |
123 | |
124 columnNamesString = paste(colnames(data),sep="",collapse=",") | |
125 if(length(columnNamesString)==0){columnNamesString = NA} | |
126 | |
127 #Get row indices | |
128 rowStart = 1 | |
129 if(!is.na(rowNamesString)){rowStart = 2} | |
130 rowEnd = nrow(data)+rowStart - 1 | |
131 rowIndices = paste(c(rowStart:rowEnd),sep="",collapse=",") | |
132 | |
133 #Get col indices | |
134 colStart = 1 | |
135 if(!is.na(columnNamesString)){ colStart = 2} | |
136 colEnd = columnCount+colStart - 1 | |
137 colIndices = paste(c(colStart:colEnd),sep="",collapse=",") | |
138 | |
139 #Write Data to file | |
140 write.table(data, saveFileList[dataIndex], quote = FALSE, sep = acharDelimiter, col.names = NA, row.names = rowNames, na = "NA", append = FALSE) | |
141 | |
142 #Write the read config file | |
143 funcWriteMatrixToReadConfigFile(strConfigureFileName=configureFileName, strMatrixName=dataFrameNames[dataIndex], | |
144 strRowIndices=rowIndices, strColIndices=colIndices, acharDelimiter=acharDelimiter, fAppend=TRUE) | |
145 } | |
146 return(TRUE) | |
147 } | |
148 | |
149 funcReadMatrices = function( | |
150 ### Dynamically Read a Matrix/Matrices from a configure file | |
151 configureFile, | |
152 ### Read config file to guide reading in data | |
153 defaultFile = NA, | |
154 ### Default data file to read | |
155 log = FALSE | |
156 ){ | |
157 #Named vector to return data frames read | |
158 returnFrames = list() | |
159 #Holds the names of the frames as they are being added | |
160 returnFrameNames = c() | |
161 returnFramesIndex = 1 | |
162 | |
163 #Read in config file info | |
164 #Read each data block extracted from the config file | |
165 lsDataBlocks <- funcReadConfigFile(configureFile, defaultFile) | |
166 if(!length(lsDataBlocks)) { | |
167 astrMetadata <- NULL | |
168 astrMetadata[2] <- defaultFile | |
169 astrMetadata[5] <- "2" | |
170 astrData <- NULL | |
171 astrData[2] <- defaultFile | |
172 astrData[5] <- "3-" | |
173 lsDataBlocks <- list(astrMetadata, astrData) | |
174 } | |
175 for(dataBlock in lsDataBlocks) | |
176 { | |
177 #Read in matrix | |
178 returnFrames[[returnFramesIndex]] = funcReadMatrix(tempMatrixName=dataBlock[1], tempFileName=dataBlock[2], tempDelimiter=dataBlock[3], tempColumns=dataBlock[5], tempRows=dataBlock[4], tempLog=log) | |
179 returnFrameNames = c(returnFrameNames,dataBlock[1]) | |
180 returnFramesIndex = returnFramesIndex + 1 | |
181 } | |
182 names(returnFrames) = returnFrameNames | |
183 return(returnFrames) | |
184 } | |
185 | |
186 funcReadMatrix = function( | |
187 ### Read one matrix | |
188 ### The name to give the block of data read in from file | |
189 tempMatrixName, | |
190 ### ID rows and columns are assumed to be 1 | |
191 tempFileName=NA, | |
192 ### Data file to read | |
193 tempDelimiter=NA, | |
194 ### Data matrix delimiter | |
195 tempColumns=NA, | |
196 ### Data columns to read | |
197 tempRows=NA, | |
198 ### Data rows to read | |
199 tempLog=FALSE | |
200 ### Indicator to log | |
201 ){ | |
202 if(is.na(tempDelimiter)){tempDelimiter <- c_strDefaultMatrixDelimiter} | |
203 if(is.na(tempColumns)){tempColumns <- c_strDefaultReadCols} | |
204 if(is.na(tempRows)){tempRows <- c_strDefaultReadRows} | |
205 | |
206 #Check parameter and make sure not NA | |
207 if(is.na(tempMatrixName)){tempMatrixName <- ""} | |
208 if(!funcIsValid(tempMatrixName)){stop(paste("Did not receive a valid matrix name, received ",tempMatrixName,"."))} | |
209 | |
210 #Check to make sure there is a file name for the matrix | |
211 if(! funcIsValidFileName(tempFileName)) | |
212 {stop(paste("No valid file name is given for the matrix ",tempMatrixName," from file: ",tempFileName,". Please add a valid file name to read the matrix from.", sep=""))} | |
213 | |
214 #Read in superset matrix and give names if indicated | |
215 #Read in matrix | |
216 dataMatrix = read.table(tempFileName, sep = tempDelimiter, as.is = TRUE, na.strings=c_astrNA, quote = "", comment.char = "") | |
217 dataFrameDimension = dim(dataMatrix) | |
218 | |
219 #Get column names | |
220 columnNameList = as.matrix(dataMatrix[1,]) | |
221 rowNameList = dataMatrix[1][[1]] | |
222 | |
223 #Convert characters to vectors of indices | |
224 tempColumns = funcParseIndexSlices(ifelse(is.na(tempColumns),"-",tempColumns), columnNameList) | |
225 tempRows = funcParseIndexSlices(ifelse(is.na(tempRows),"-", tempRows), rowNameList) | |
226 | |
227 #Check indices | |
228 #Check to make sure valid id col/rows and data col/rows | |
229 if((!funcIsValid(tempColumns)) || (!funcIsValid(tempRows))) | |
230 {stop(paste("Received invalid row or col. Rows=",tempRows," Cols=", tempColumns))} | |
231 | |
232 #Check to make sure only 1 row id is given and it is not repeated in the data rows | |
233 if(length(intersect(1,tempColumns)) == 1) | |
234 {stop(paste("Index indicated as an id row but was found in the data row indices, can not be both. Index=1 Data indices=",tempColumns,sep=""))} | |
235 | |
236 #Check to make sure only one col id is given and it is not repeated in the data columns | |
237 #Id row/col should not be in data row/col | |
238 if(length(intersect(1, tempRows)) == 1) | |
239 {stop(paste("Index indicated as an id column but was found in the data column indices, can not be both. ID Index=1 Data Indices=", tempRows,".",sep=""))} | |
240 | |
241 #If the row names have the same length as the column count and has column names | |
242 #it is assumed that the tempIdCol index item is associated with the column names. | |
243 #Visa versa for rows, either way it is removed | |
244 #Remove ids from name vector | |
245 rowNameList = rowNameList[(-1)] | |
246 #Remove ids from data | |
247 dataMatrix = dataMatrix[(-1)] | |
248 #Adjust row ids given the removal of the id row | |
249 tempColumns=(tempColumns-1) | |
250 | |
251 ## Remove id rows/columns and set row/col names | |
252 #Remove ids from vector | |
253 columnNameList = columnNameList[(-1)] | |
254 #Remove ids from data | |
255 dataMatrix = dataMatrix[(-1),] | |
256 #Adjust column ids given the removal of the id column | |
257 tempRows =(tempRows-1) | |
258 #Add row and column names | |
259 row.names(dataMatrix) = as.character(rowNameList) | |
260 colnames(dataMatrix) = as.character(columnNameList) | |
261 | |
262 #Reduce matrix | |
263 #Account for when both column ranges and row ranges are given or just a column or a row range is given | |
264 dataMatrix = dataMatrix[tempRows, tempColumns, drop=FALSE] | |
265 | |
266 #Set all columns data types to R guessed default | |
267 for(i in 1:ncol(dataMatrix)){ | |
268 dataMatrix[,i] <- type.convert(dataMatrix[,i], na.strings = c_astrNA)} | |
269 | |
270 #Return matrix | |
271 return(dataMatrix) | |
272 } | |
273 | |
274 funcReadConfigFile = function( | |
275 ### Reads in configure file and extracts the pieces needed for reading in a matrix | |
276 configureFile, | |
277 ### Configure file = string path to configure file | |
278 defaultFile = NA | |
279 ### Used to set a default data file | |
280 ){ | |
281 #Read configure file | |
282 fileDataList <- list() | |
283 if(!is.null( configureFile ) ) { | |
284 fileDataList <- scan( file = configureFile, what = character(), sep="\n", quiet=TRUE) } | |
285 newList = list() | |
286 for(sLine in fileDataList) | |
287 { | |
288 sLine = gsub("\\s","",sLine) | |
289 vUnits = unlist(strsplit(sLine,":")) | |
290 if(length(vUnits)>1) | |
291 { | |
292 vUnits[1] = paste(vUnits[1],":",sep="") | |
293 newList[[length(newList)+1]] = vUnits | |
294 } | |
295 } | |
296 fileDataList = unlist(newList) | |
297 | |
298 matrixName <- NA | |
299 fileName <- defaultFile | |
300 | |
301 #Hold information on matrices to be read | |
302 matrixInformationList = list() | |
303 matrixInformationListCount = 1 | |
304 | |
305 for(textIndex in c(1:length(fileDataList))) | |
306 { | |
307 if(textIndex > length(fileDataList)) {break} | |
308 #Start at the Matrix name | |
309 #Keep this if statement first so that you scan through until you find a matrix block | |
310 if(fileDataList[textIndex] == c_MATRIX_NAME) | |
311 { | |
312 #If the file name is not NA then that is sufficient for a matrix, store | |
313 #Either way reset | |
314 if(funcIsValid(fileName)&&funcIsValid(matrixName)) | |
315 { | |
316 matrixInformationList[[matrixInformationListCount]] = c(matrixName,fileName,delimiter,rows,columns) | |
317 matrixInformationListCount = matrixInformationListCount + 1 | |
318 } | |
319 | |
320 #Get the matrix name and store | |
321 matrixName = fileDataList[textIndex + 1] | |
322 | |
323 fileName = defaultFile | |
324 delimiter = "\t" | |
325 rows = NA | |
326 columns = NA | |
327 #If is not matrix name and no matrix name is known skip until you find the matrix name | |
328 #If matrix name is known, continue to collect information about that matrix | |
329 } else if(is.na(matrixName)){next} | |
330 | |
331 #Parse different keywords | |
332 strParseKey = fileDataList[textIndex] | |
333 if(strParseKey == c_FILE_NAME){fileName=fileDataList[textIndex+1]} | |
334 else if(strParseKey==c_FILE_NAME){fileName=fileDataList[textIndex+1]} | |
335 else if(strParseKey %in% c(c_TSVROWS,c_PCLCOLUMNS,c_ROWS)){rows=fileDataList[textIndex+1]} | |
336 else if(strParseKey %in% c(c_TSVCOLUMNS,c_PCLROWS,c_COLUMNS)){columns=fileDataList[textIndex+1]} | |
337 else if(strParseKey==c_DELIMITER) | |
338 { | |
339 switch(fileDataList[textIndex + 1], | |
340 "TAB" = {delimiter = "\t"}, | |
341 "SPACE" = {delimiter = " "}, | |
342 "RETURN" = {delimiter = "\r"}, | |
343 "ENDLINE" = {delimiter = "\n"}) | |
344 } | |
345 } | |
346 #If there is matrix information left | |
347 if((!is.na(matrixName)) && (!is.na(fileName))) | |
348 { | |
349 matrixInformationList[[matrixInformationListCount]] = c(matrixName,fileName,delimiter,rows,columns) | |
350 matrixInformationListCount = matrixInformationListCount + 1 | |
351 } | |
352 return(matrixInformationList) | |
353 } | |
354 | |
355 funcParseIndexSlices = function( | |
356 ### Take a string of comma or dash seperated integer strings and convert into a vector | |
357 ### of integers to use in index slicing | |
358 strIndexString, | |
359 ### String to be parsed into indicies vector | |
360 cstrNames | |
361 ### Column names of the data so names can be resolved to indicies | |
362 ){ | |
363 #If the slices are NA then return | |
364 if(is.na(strIndexString)){return(strIndexString)} | |
365 | |
366 #List of indices to return | |
367 viRetIndicies = c() | |
368 | |
369 #Split on commas | |
370 lIndexString = sapply(strsplit(strIndexString, c_COMMA),function(x) return(x)) | |
371 for(strIndexItem in lIndexString) | |
372 { | |
373 #Handle the - case | |
374 if(strIndexItem=="-"){strIndexItem = paste("2-",length(cstrNames),sep="")} | |
375 | |
376 #Split on dash and make sure it makes sense | |
377 lItemElement = strsplit(strIndexItem, c_DASH)[[1]] | |
378 if(length(lItemElement)>2){stop("Error in index, too many dashes, only one is allowed. Index = ",strIndexItem,sep="")} | |
379 | |
380 #Switch names to numbers | |
381 aiIndices = which(is.na(as.numeric(lItemElement))) | |
382 for( iIndex in aiIndices ) | |
383 { | |
384 lItemElement[iIndex] = which(cstrNames==lItemElement[iIndex])[1] | |
385 } | |
386 | |
387 #Make numeric | |
388 liItemElement = unlist(lapply(lItemElement, as.numeric)) | |
389 | |
390 #If dash is at the end or the beginning add on the correct number | |
391 if(substr(strIndexItem,1,1)==c_DASH){liItemElement[1]=2} | |
392 if(substr(strIndexItem,nchar(strIndexItem),nchar(strIndexItem))==c_DASH){liItemElement[2]=length(cstrNames)} | |
393 | |
394 #If multiple numbers turn to a slice | |
395 if(length(liItemElement)==2){liItemElement = c(liItemElement[1]:liItemElement[2])} | |
396 | |
397 #Update indices | |
398 viRetIndicies = c(viRetIndicies, liItemElement) | |
399 } | |
400 if(length(viRetIndicies)==0){return(NA)} | |
401 return(sort(unique(viRetIndicies))) | |
402 ### Sorted indicies vector | |
403 } |