Mercurial > repos > george-weingart > maaslin

diff src/lib/IO.R @ 0:e0b5980139d9
maaslin
author: george-weingart
date: Tue, 13 May 2014 22:00:40 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib/IO.R	Tue May 13 22:00:40 2014 -0400
@@ -0,0 +1,403 @@
+#####################################################################################
+#Copyright (C) <2012>
+#
+#Permission is hereby granted, free of charge, to any person obtaining a copy of
+#this software and associated documentation files (the "Software"), to deal in the
+#Software without restriction, including without limitation the rights to use, copy,
+#modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
+#and to permit persons to whom the Software is furnished to do so, subject to
+#the following conditions:
+#
+#The above copyright notice and this permission notice shall be included in all copies
+#or substantial portions of the Software.
+#
+#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+#INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+#PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+#HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+#OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+#SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#
+# This file is a component of the MaAsLin (Multivariate Associations Using Linear Models), 
+# authored by the Huttenhower lab at the Harvard School of Public Health
+# (contact Timothy Tickle, ttickle@hsph.harvard.edu).
+#####################################################################################
+
+inlinedocs <- function(
+##author<< Curtis Huttenhower <chuttenh@hsph.harvard.edu> and Timothy Tickle <ttickle@hsph.harvard.edu>
+##description<< Collection of functions centered on custom reading of data and some IO services.
+) { return( pArgs ) }
+
+#Project Constants
+
+c_astrNA <- c(""," ","  ","NA","na")
+
+#Do not report warnings
+options(warn=-1)
+
+funcWriteMatrixToReadConfigFile = function(
+### Writes a read config file. Will write over a file by default
+strConfigureFileName,
+### Matrix that will be read
+strMatrixName,
+### Name of matrix that will be read
+strRowIndices=NA,
+### Rows which will be Read (TSV) by default all will be read
+strColIndices=NA,
+### Cols which will be Read (TSV) by default all will be read
+acharDelimiter=c_strDefaultMatrixDelimiter,
+### Delimiter for the matrix that will be read in\
+fAppend=FALSE
+### Append to a current read config file
+){
+  #If no append delete previous file
+  if(!fAppend){unlink(strConfigureFileName)}
+
+  #Make delimiter readable
+  switch(acharDelimiter,
+    "\t" = {acharDelimiter = "TAB"},
+    " " = {acharDelimiter = "SPACE"},
+    "\r" = {acharDelimiter = "RETURN"},
+    "\n" = {acharDelimiter = "ENDLINE"})
+    
+  #Manage NAs
+  if(is.na(strRowIndices)){strRowIndices="-"}
+  if(is.na(strColIndices)){strColIndices="-"}
+
+  #Required output
+  lsDataLines = c(paste(c_MATRIX_NAME,strMatrixName,sep=" "),
+    paste(c_DELIMITER,acharDelimiter,sep=" "),
+    paste(c_ID_ROW,"1",sep=" "),
+    paste(c_ID_COLUMN,"1",sep=" "),
+    paste(c_TSVROWS,strRowIndices,sep=" "),
+    paste(c_TSVCOLUMNS,strColIndices,sep=" "))
+
+  lsDataLines = c(lsDataLines,"\n")
+
+  #Output to file
+  lapply(lsDataLines, cat, file=strConfigureFileName, sep="\n", append=TRUE)
+}
+
+funcWriteMatrices = function(
+### Write data frame data files with config files
+dataFrameList,
+### A named list of data frames (what you get directly from the read function)
+saveFileList,
+### File names to save the data matrices in (one name per data frame)
+configureFileName,
+### Name of the configure file to be written which will direct the reading of these data
+acharDelimiter=c_strDefaultMatrixDelimiter,
+### Matrix delimiter
+log = FALSE
+### Indicates if logging should occur
+){
+  #Get names
+  dataFrameNames = names(dataFrameList)
+
+  #Get length of dataFrameList
+  dataFrameListLength = length(dataFrameList)
+
+  #Get length of save file list
+  saveFileListLength = length(saveFileList)
+
+  #If the save file list length and data frame list length are not equal, abort
+  if(!saveFileListLength == dataFrameListLength)
+  {stop(paste("Received a length of save files (",saveFileListLength,") that are different from the count of data frames (",dataFrameListLength,"). Stopped and returned false."),sep="")}
+
+  #Delete the old config file
+  unlink(configureFileName)
+
+  #For each data save
+  for (dataIndex in c(1:dataFrameListLength))
+  {
+    #Current data frame
+    data = dataFrameList[[dataIndex]]
+
+    #Get column count
+    columnCount = ncol(data)
+
+    #Get row and column names
+    rowNames = row.names(data)
+    rowNamesString = paste(rowNames,sep="",collapse=",")
+    if(length(rowNamesString)==0){rowNamesString = NA}
+
+    columnNamesString = paste(colnames(data),sep="",collapse=",")
+    if(length(columnNamesString)==0){columnNamesString = NA}
+
+    #Get row indices
+    rowStart = 1
+    if(!is.na(rowNamesString)){rowStart = 2}
+    rowEnd = nrow(data)+rowStart - 1
+    rowIndices = paste(c(rowStart:rowEnd),sep="",collapse=",")
+
+    #Get col indices
+    colStart = 1
+    if(!is.na(columnNamesString)){  colStart = 2}
+    colEnd = columnCount+colStart - 1
+    colIndices = paste(c(colStart:colEnd),sep="",collapse=",")
+
+    #Write Data to file
+    write.table(data, saveFileList[dataIndex], quote = FALSE, sep = acharDelimiter, col.names = NA, row.names = rowNames, na = "NA", append = FALSE)
+
+    #Write the read config file
+    funcWriteMatrixToReadConfigFile(strConfigureFileName=configureFileName, strMatrixName=dataFrameNames[dataIndex],
+      strRowIndices=rowIndices, strColIndices=colIndices, acharDelimiter=acharDelimiter, fAppend=TRUE)
+  }
+  return(TRUE)
+}
+
+funcReadMatrices = function(
+### Dynamically Read a Matrix/Matrices from a configure file
+configureFile,
+### Read config file to guide reading in data
+defaultFile = NA,
+### Default data file to read
+log = FALSE
+){
+  #Named vector to return data frames read
+  returnFrames = list()
+  #Holds the names of the frames as they are being added
+  returnFrameNames = c()
+  returnFramesIndex = 1
+
+  #Read in config file info
+  #Read each data block extracted from the config file
+  lsDataBlocks <- funcReadConfigFile(configureFile, defaultFile)
+  if(!length(lsDataBlocks)) {
+	  astrMetadata <- NULL
+	  astrMetadata[2] <- defaultFile
+	  astrMetadata[5] <- "2"
+	  astrData <- NULL
+	  astrData[2] <- defaultFile
+	  astrData[5] <- "3-"
+	  lsDataBlocks <- list(astrMetadata, astrData)
+  }
+  for(dataBlock in lsDataBlocks)
+  {
+    #Read in matrix
+    returnFrames[[returnFramesIndex]] = funcReadMatrix(tempMatrixName=dataBlock[1], tempFileName=dataBlock[2], tempDelimiter=dataBlock[3], tempColumns=dataBlock[5], tempRows=dataBlock[4], tempLog=log)
+    returnFrameNames = c(returnFrameNames,dataBlock[1])
+    returnFramesIndex = returnFramesIndex + 1
+  }
+  names(returnFrames) = returnFrameNames
+  return(returnFrames)
+}
+
+funcReadMatrix = function(
+### Read one matrix
+### The name to give the block of data read in from file
+tempMatrixName,
+### ID rows and columns are assumed to be 1
+tempFileName=NA,
+### Data file to read
+tempDelimiter=NA,
+### Data matrix delimiter
+tempColumns=NA,
+### Data columns to read
+tempRows=NA,
+### Data rows to read
+tempLog=FALSE
+### Indicator to log
+){
+  if(is.na(tempDelimiter)){tempDelimiter <- c_strDefaultMatrixDelimiter}
+  if(is.na(tempColumns)){tempColumns <- c_strDefaultReadCols}
+  if(is.na(tempRows)){tempRows <- c_strDefaultReadRows}
+  
+  #Check parameter and make sure not NA
+  if(is.na(tempMatrixName)){tempMatrixName <- ""}
+  if(!funcIsValid(tempMatrixName)){stop(paste("Did not receive a valid matrix name, received ",tempMatrixName,"."))}
+
+  #Check to make sure there is a file name for the matrix
+  if(! funcIsValidFileName(tempFileName))
+  {stop(paste("No valid file name is given for the matrix ",tempMatrixName," from file: ",tempFileName,". Please add a valid file name to read the matrix from.", sep=""))}
+
+  #Read in superset matrix and give names if indicated 
+  #Read in matrix
+  dataMatrix = read.table(tempFileName, sep = tempDelimiter, as.is = TRUE, na.strings=c_astrNA, quote = "", comment.char = "")
+  dataFrameDimension = dim(dataMatrix)
+
+  #Get column names
+  columnNameList = as.matrix(dataMatrix[1,])
+  rowNameList = dataMatrix[1][[1]]
+
+  #Convert characters to vectors of indices
+  tempColumns = funcParseIndexSlices(ifelse(is.na(tempColumns),"-",tempColumns), columnNameList)
+  tempRows = funcParseIndexSlices(ifelse(is.na(tempRows),"-", tempRows), rowNameList)
+
+  #Check indices
+  #Check to make sure valid id col/rows and data col/rows
+  if((!funcIsValid(tempColumns)) || (!funcIsValid(tempRows)))
+  {stop(paste("Received invalid row or col. Rows=",tempRows," Cols=", tempColumns))}
+
+  #Check to make sure only 1 row id is given and it is not repeated in the data rows
+  if(length(intersect(1,tempColumns)) == 1)
+  {stop(paste("Index indicated as an id row but was found in the data row indices, can not be both. Index=1 Data indices=",tempColumns,sep=""))}
+
+  #Check to make sure only one col id is given and it is not repeated in the data columns
+  #Id row/col should not be in data row/col
+  if(length(intersect(1, tempRows)) == 1)
+  {stop(paste("Index indicated as an id column but was found in the data column indices, can not be both. ID Index=1 Data Indices=", tempRows,".",sep=""))}
+
+  #If the row names have the same length as the column count and has column names 
+  #it is assumed that the tempIdCol index item is associated with the column names.
+  #Visa versa for rows, either way it is removed
+  #Remove ids from name vector
+  rowNameList = rowNameList[(-1)]
+  #Remove ids from data
+  dataMatrix = dataMatrix[(-1)]
+  #Adjust row ids given the removal of the id row
+  tempColumns=(tempColumns-1)
+
+  ## Remove id rows/columns and set row/col names
+  #Remove ids from vector
+  columnNameList = columnNameList[(-1)]
+  #Remove ids from data
+  dataMatrix = dataMatrix[(-1),]
+  #Adjust column ids given the removal of the id column
+  tempRows =(tempRows-1)
+  #Add row and column names
+  row.names(dataMatrix) = as.character(rowNameList)
+  colnames(dataMatrix) = as.character(columnNameList)
+
+  #Reduce matrix
+  #Account for when both column ranges and row ranges are given or just a column or a row range is given
+  dataMatrix = dataMatrix[tempRows, tempColumns, drop=FALSE]
+
+  #Set all columns data types to R guessed default
+  for(i in 1:ncol(dataMatrix)){
+    dataMatrix[,i] <- type.convert(dataMatrix[,i], na.strings = c_astrNA)}
+
+  #Return matrix
+  return(dataMatrix)
+}
+
+funcReadConfigFile = function(
+### Reads in configure file and extracts the pieces needed for reading in a matrix
+configureFile,
+### Configure file = string path to configure file
+defaultFile = NA
+### Used to set a default data file
+){
+  #Read configure file
+  fileDataList <- list()
+  if(!is.null( configureFile ) ) {
+    fileDataList <- scan( file = configureFile, what = character(), sep="\n", quiet=TRUE) }
+  newList = list()
+  for(sLine in fileDataList)
+  {
+    sLine = gsub("\\s","",sLine)
+    vUnits = unlist(strsplit(sLine,":"))
+    if(length(vUnits)>1)
+    {
+      vUnits[1] = paste(vUnits[1],":",sep="")
+      newList[[length(newList)+1]] = vUnits
+    }
+  }
+  fileDataList = unlist(newList)
+
+  matrixName <- NA
+  fileName <- defaultFile
+
+  #Hold information on matrices to be read
+  matrixInformationList = list()
+  matrixInformationListCount = 1
+
+  for(textIndex in c(1:length(fileDataList)))
+  {
+    if(textIndex > length(fileDataList)) {break}
+    #Start at the Matrix name
+    #Keep this if statement first so that you scan through until you find a matrix block
+    if(fileDataList[textIndex] == c_MATRIX_NAME)
+    {
+      #If the file name is not NA then that is sufficient for a matrix, store
+      #Either way reset
+      if(funcIsValid(fileName)&&funcIsValid(matrixName))
+      {
+        matrixInformationList[[matrixInformationListCount]] = c(matrixName,fileName,delimiter,rows,columns)
+        matrixInformationListCount = matrixInformationListCount + 1
+      }
+
+      #Get the matrix name and store
+      matrixName = fileDataList[textIndex + 1]
+
+      fileName = defaultFile
+      delimiter = "\t"
+      rows = NA
+      columns = NA
+      #If is not matrix name and no matrix name is known skip until you find the matrix name
+      #If matrix name is known, continue to collect information about that matrix
+    } else if(is.na(matrixName)){next}
+
+    #Parse different keywords
+    strParseKey = fileDataList[textIndex]
+    if(strParseKey == c_FILE_NAME){fileName=fileDataList[textIndex+1]}
+    else if(strParseKey==c_FILE_NAME){fileName=fileDataList[textIndex+1]}
+    else if(strParseKey %in% c(c_TSVROWS,c_PCLCOLUMNS,c_ROWS)){rows=fileDataList[textIndex+1]}
+    else if(strParseKey %in% c(c_TSVCOLUMNS,c_PCLROWS,c_COLUMNS)){columns=fileDataList[textIndex+1]}
+    else if(strParseKey==c_DELIMITER)
+    {
+        switch(fileDataList[textIndex + 1],
+        "TAB" = {delimiter = "\t"},
+        "SPACE" = {delimiter = " "},
+        "RETURN" = {delimiter = "\r"},
+        "ENDLINE" = {delimiter = "\n"})
+    }
+  }
+  #If there is matrix information left
+  if((!is.na(matrixName)) && (!is.na(fileName)))
+  {
+    matrixInformationList[[matrixInformationListCount]] = c(matrixName,fileName,delimiter,rows,columns)
+    matrixInformationListCount = matrixInformationListCount + 1
+  }
+  return(matrixInformationList)
+}
+
+funcParseIndexSlices = function(
+### Take a string of comma or dash seperated integer strings and convert into a vector
+### of integers to use in index slicing
+strIndexString,
+### String to be parsed into indicies vector
+cstrNames
+### Column names of the data so names can be resolved to indicies
+){
+  #If the slices are NA then return
+  if(is.na(strIndexString)){return(strIndexString)}
+
+  #List of indices to return
+  viRetIndicies = c()
+
+  #Split on commas
+  lIndexString = sapply(strsplit(strIndexString, c_COMMA),function(x) return(x))
+  for(strIndexItem in lIndexString)
+  {
+    #Handle the - case
+    if(strIndexItem=="-"){strIndexItem = paste("2-",length(cstrNames),sep="")}
+
+    #Split on dash and make sure it makes sense
+    lItemElement = strsplit(strIndexItem, c_DASH)[[1]]
+    if(length(lItemElement)>2){stop("Error in index, too many dashes, only one is allowed. Index = ",strIndexItem,sep="")}
+
+    #Switch names to numbers
+    aiIndices = which(is.na(as.numeric(lItemElement)))
+    for( iIndex in aiIndices )
+    {
+      lItemElement[iIndex] = which(cstrNames==lItemElement[iIndex])[1]
+    }
+
+    #Make numeric
+    liItemElement = unlist(lapply(lItemElement, as.numeric))
+
+    #If dash is at the end or the beginning add on the correct number
+    if(substr(strIndexItem,1,1)==c_DASH){liItemElement[1]=2}
+    if(substr(strIndexItem,nchar(strIndexItem),nchar(strIndexItem))==c_DASH){liItemElement[2]=length(cstrNames)}
+
+    #If multiple numbers turn to a slice
+    if(length(liItemElement)==2){liItemElement = c(liItemElement[1]:liItemElement[2])}
+
+    #Update indices
+    viRetIndicies = c(viRetIndicies, liItemElement)
+  }
+  if(length(viRetIndicies)==0){return(NA)}
+  return(sort(unique(viRetIndicies)))
+  ### Sorted indicies vector
+}
author	george-weingart
date	Tue, 13 May 2014 22:00:40 -0400
parents
children