Mercurial > repos > george-weingart > micropita
view src/breadcrumbs/src/AbundanceTable.py @ 0:2f4f6f08c8c4 draft
Uploaded
author | george-weingart |
---|---|
date | Tue, 13 May 2014 21:58:57 -0400 |
parents | |
children | cdef6996e3f3 |
line wrap: on
line source
""" Author: Timothy Tickle Description: Class to abstract an abundance table and methods to run on such a table. """ ##################################################################################### #Copyright (C) <2012> # #Permission is hereby granted, free of charge, to any person obtaining a copy of #this software and associated documentation files (the "Software"), to deal in the #Software without restriction, including without limitation the rights to use, copy, #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, #and to permit persons to whom the Software is furnished to do so, subject to #the following conditions: # #The above copyright notice and this permission notice shall be included in all copies #or substantial portions of the Software. # #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ##################################################################################### __author__ = "Timothy Tickle" __copyright__ = "Copyright 2012" __credits__ = ["Timothy Tickle"] __license__ = "MIT" __maintainer__ = "Timothy Tickle" __email__ = "ttickle@sph.harvard.edu" __status__ = "Development" import csv import sys import blist from CClade import CClade from ConstantsBreadCrumbs import ConstantsBreadCrumbs import copy from datetime import date import numpy as np import os import re import scipy.stats import string from ValidateData import ValidateData from biom.parse import * from biom.table import * c_dTarget = 1.0 c_fRound = False c_iSumAllCladeLevels = -1 c_fOutputLeavesOnly = False class RowMetadata: """ Holds the row (feature) metadata and associated functions. """ def __init__(self, dictRowMetadata, iLongestMetadataEntry=None, lsRowMetadataIDs=None): """ Constructor requires a dictionary or row metadata. :param dictRowMetadata: The row metadata values with the ids as the keys, must be stable (keep order) :type: {string feature id: {'metadata': {'taxonomy': [list of metadata values]}}} """ self.dictRowMetadata = dictRowMetadata self.iLongestMetadataEntry = iLongestMetadataEntry self.lsRowMetadataIDs = lsRowMetadataIDs self.dictMetadataIDs = {} # Get the ids for the metadata if self.dictRowMetadata: for dictMetadata in self.dictRowMetadata.values(): dictMetadata = dictMetadata.get(ConstantsBreadCrumbs.c_metadata_lowercase, None) if dictMetadata: for key,value in dictMetadata.items(): if self.dictMetadataIDs.get(key, None): self.dictMetadataIDs[key] = max(self.dictMetadataIDs[key],len(dictMetadata[key])) else: self.dictMetadataIDs[key] = len(dictMetadata[key]) def funcMakeIDs(self): """ There should be a one to one mapping of row metadata ids and the values associated here with a feature ID. If not make ids from the key by appending numbers. """ # If there exists a ids list already return (this allows ID order to be given and preserved) # Else make a list of IDs if self.lsRowMetadataIDs: return self.lsRowMetadataIDs lsIDs = [] lsKeys = [] for key, value in self.dictMetadataIDs.items(): lsKeys.append( key ) if value > 1: lsIDs.extend( [ "_".join( [ key, str( iIndex ) ] ) for iIndex in xrange( value ) ] ) else: lsIDs.append( key ) return [ lsIDs, lsKeys ] def funGetFeatureMetadata(self, sFeature, sMetadata): """ Returns a list of values in the order of row metadta ids for a microbial feature given an id. :param sFeature: Feature id to get metadata :type: string :param sMetadata: Metadata to get :type: string :return: list of metadata associated with the metadata """ lsMetadata = [] if self.dictRowMetadata: dictFeature = self.dictRowMetadata.get( sFeature, None ) if dictFeature: dictFeatureMetadata = dictFeature.get(ConstantsBreadCrumbs.c_metadata_lowercase, None) if dictFeatureMetadata: lsMetadata = dictFeatureMetadata.get(sMetadata, None) return lsMetadata class AbundanceTable: """ Represents an abundance table and contains common function to perform on the object. This class is made from an abundance data file. What is expected is a text file delimited by a character (which is given to the object). The first column is expected to be the id column for each of the rows. Metadata is expected before measurement data. Columns are samples and rows are features (bugs). This object is currently not hashable. """ def __init__(self, npaAbundance, dictMetadata, strName, strLastMetadata, rwmtRowMetadata = None, dictFileMetadata = None, lOccurenceFilter = None, cFileDelimiter = ConstantsBreadCrumbs.c_cTab, cFeatureNameDelimiter = "|"): """ Constructor for an abundance table. :param npaAbundance: Structured Array of abundance data (Row=Features, Columns=Samples) :type: Numpy Structured Array abundance data (Row=Features, Columns=Samples) :param dictMetadata: Dictionary of metadata {"String ID":["strValue","strValue","strValue","strValue","strValue"]} :type: Dictionary Dictionary :param npaRowMetdata Structured Array of row (feature) metadata (optional) :type: Numpy Structured Array abundance data (Row=Features, Columns=Feature metadata) :param strName: The name of the metadata that serves as the ID for the columns (For example a sample ID) :type: string :param strLastMetadata: The string last metadata name :type: string :param lOccurenceFilter: List of integers used in an occurence filter. [Min abundance, Min sample] :type: List of integers :param cFileDelimiter: Character used as the delimiter of the file that is read in to create the abundance table. Will also be used to write the abudance table file to a file to keep file consistency. :type: Character delimiter for reading the data in (default = TAB) :param cFeatureNameDelimiter: Character used as the delimiter of the feature names (column 1). This is useful if the name are complex, for instance consensus lineages in metagenomics. :type: Character delimiter for feature names (default = |) """ ### File Metadata #Date self.dateCreationDate = dictFileMetadata.get(ConstantsBreadCrumbs.c_strDateKey,None) if dictFileMetadata else None #Indicates if the table has been filtered and how self._strCurrentFilterState = "" #The delimiter from the source file self._cDelimiter = cFileDelimiter #The feature name delimiter self._cFeatureDelimiter = cFeatureNameDelimiter #File type self.strFileFormatType = dictFileMetadata.get(ConstantsBreadCrumbs.c_strFormatKey,None) if dictFileMetadata else None #File generation source self.strFileGenerationSource = dictFileMetadata.get(ConstantsBreadCrumbs.c_strSourceKey,None) if dictFileMetadata else None #File type self.strFileType = dictFileMetadata.get(ConstantsBreadCrumbs.c_strTypekey,None) if dictFileMetadata else None #File url self.strFileURL = dictFileMetadata.get(ConstantsBreadCrumbs.c_strURLKey,None) if dictFileMetadata else None #The id of the file self.strId = dictFileMetadata.get(ConstantsBreadCrumbs.c_strIDKey,None) if dictFileMetadata else None #The lastmetadata name (which should be preserved when writing the file) # Can be a None if biom file is read in. self._strLastMetadataName = strLastMetadata #The original number of features in the table self._iOriginalFeatureCount = -1 #The name of the object relating to the file it was read from or would have been read from if it exists #Keeps tract of changes to the file through the name #Will be used to write out the object to a file as needed self._strOriginalName = strName #The original number of samples in the table self._iOriginalSampleCount = -1 #Data sparsity type self.fSparseMatrix = dictFileMetadata.get(ConstantsBreadCrumbs.c_strSparsityKey,False) if dictFileMetadata else False ### Data metadata #The column (sample) metdata self._dictTableMetadata = dictMetadata #The row (feature) metadata (Row Metadata object) self.rwmtRowMetadata = rwmtRowMetadata ### Data #The abundance data self._npaFeatureAbundance = npaAbundance ### Logistical #Clade prefixes for biological samples self._lsCladePrefixes = ["k__","p__","c__","o__","f__","g__","s__"] #This is not a hashable object self.__hash__ = None ### Prep the object self._fIsNormalized = self._fIsSummed = None #If contents is not a false then set contents to appropriate objects # Checking to see if the data is normalized, summed and if we need to run a filter on it. if ( self._npaFeatureAbundance != None ) and self._dictTableMetadata: self._iOriginalFeatureCount = self._npaFeatureAbundance.shape[0] self._iOriginalSampleCount = len(self.funcGetSampleNames()) self._fIsNormalized = ( max( [max( list(a)[1:] or [0] ) for a in self._npaFeatureAbundance] or [0] ) <= 1 ) lsLeaves = AbundanceTable.funcGetTerminalNodesFromList( [a[0] for a in self._npaFeatureAbundance], self._cFeatureDelimiter ) self._fIsSummed = ( len( lsLeaves ) != len( self._npaFeatureAbundance ) ) #Occurence filtering #Removes features that do not have a given level iLowestAbundance in a given amount of samples iLowestSampleOccurence if ( not self._fIsNormalized ) and lOccurenceFilter: iLowestAbundance, iLowestSampleOccurrence = lOccurenceFilter self.funcFilterAbundanceBySequenceOccurence( iLowestAbundance, iLowestSampleOccurrence ) # else: # sys.stderr.write( "Abundance or metadata was None, should be atleast an empty object\n" ) @staticmethod def funcMakeFromFile(xInputFile, cDelimiter = ConstantsBreadCrumbs.c_cTab, sMetadataID = None, sLastMetadataRow = None, sLastMetadata = None, lOccurenceFilter = None, cFeatureNameDelimiter="|", xOutputFile = None): """ Creates an abundance table from a table file. :param xInputFile: Path to input file. :type: String String path. :param cDelimiter: Delimiter for parsing the input file. :type: Character Character :param sMetadataID: String ID that is a metadata row ID (found on the first column) and used as an ID for samples :type: String String ID :param sLastRowMetadata: The id of the last (most right column) row metadata :type: String String ID :param sLastMetadata: The ID of the metadata that is the last metadata before measurement or feature rows. :type: String String ID :param lOccurenceFilter: List of integers used in an occurence filter. [Min abundance, Min sample] :type: List of integers :param cFeatureNameDelimiter: Used to parse feature (bug) names if they are complex. For example if they are consensus lineages and contain parent clade information. :type: Character Delimiting letter :param xOutputFile: File to output the abundance table which was read in. :type: FileStream or String file path :return AbundanceTable: Will return an AbundanceTable object on no error. Returns False on error. """ #Get output file and remove if existing outputFile = open( xOutputFile, "w" ) if isinstance(xOutputFile, str) else xOutputFile ################################################################################# # Check if file is a biom file - if so invoke the biom routine # ################################################################################# strFileName = xInputFile if isinstance(xInputFile, str) else xInputFile.name # Determine the file read function by file extension if strFileName.endswith(ConstantsBreadCrumbs.c_strBiomFile): BiomCommonArea = AbundanceTable._funcBiomToStructuredArray(xInputFile) if BiomCommonArea: lContents = [BiomCommonArea[ConstantsBreadCrumbs.c_BiomTaxData], BiomCommonArea[ConstantsBreadCrumbs.c_Metadata], BiomCommonArea[ ConstantsBreadCrumbs.c_dRowsMetadata], BiomCommonArea[ConstantsBreadCrumbs.c_BiomFileInfo] ] # Update last metadata and id if given if not sLastMetadata: strLastMetadata = BiomCommonArea[ConstantsBreadCrumbs.c_sLastMetadata] else: # return false on failure lContents = False else: #Read in from text file to create the abundance and metadata structures lContents = AbundanceTable._funcTextToStructuredArray(xInputFile=xInputFile, cDelimiter=cDelimiter, sMetadataID = sMetadataID, sLastMetadataRow = sLastMetadataRow, sLastMetadata = sLastMetadata, ostmOutputFile = outputFile) #If contents is not a false then set contents to appropriate objects return AbundanceTable(npaAbundance=lContents[0], dictMetadata=lContents[1], strName=str(xInputFile), strLastMetadata=sLastMetadata, rwmtRowMetadata = lContents[2], dictFileMetadata = lContents[3], lOccurenceFilter = lOccurenceFilter, cFileDelimiter=cDelimiter, cFeatureNameDelimiter=cFeatureNameDelimiter) if lContents else False #Testing Status: Light happy path testing @staticmethod def funcCheckRawDataFile(strReadDataFileName, iFirstDataIndex = -1, sLastMetadataName = None, lOccurenceFilter = None, strOutputFileName = "", cDelimiter = ConstantsBreadCrumbs.c_cTab): """ Check the input abundance table. Currently reduces the features that have no occurence. Also inserts a NA for blank metadata and a 0 for blank abundance data. Gives the option to filter features through an occurence filter (a feature must have a level of abundance in a minimal number of samples to be included). Either iFirstDataIndex or sLastMetadataName must be given :param strReadDataFileName: File path of file to read and check. :type: String File path. :param iFirstDataIndex: First (row) index of data not metadata in the abundance file. :type: Integer Index starting at 0. :param sLastMetadataName: The ID of the last metadata in the file. Rows of measurements should follow this metadata. :type: String :param lOccurenceFilter: The lowest number of occurences in the lowest number of samples needed for a feature to be kept :type: List[2] List length 2 [lowest abundance (not normalized), lowest number of samples to occur in] (eg. [2.0,2.0]) :param strOutputFileName: File path of out put file. :type: String File path. :param cDelimiter: Character delimiter for reading and writing files. :type: Character Delimiter. :return Output Path: Output path for written checked file. """ #Validate parameters if (iFirstDataIndex == -1) and (sLastMetadataName == None): print "AbundanceTable:checkRawDataFile::Error, either iFirstDataIndex or sLastMetadataNamemust be given." return False #Get output file and remove if existing outputFile = strOutputFileName if not strOutputFileName: outputFile = os.path.splitext(strReadDataFileName)[0]+ConstantsBreadCrumbs.OUTPUT_SUFFIX #Read input file lines #Drop blank lines readData = "" with open(strReadDataFileName,'rU') as f: readData = f.read() readData = filter(None,readData.split(ConstantsBreadCrumbs.c_strEndline)) #Read the length of each line and make sure there is no jagged data #Also hold row count for the metadata iLongestLength = len(readData[0].split(cDelimiter)) iMetadataRow = -1 if not sLastMetadataName: sLastMetadataName = "None" for iIndex, strLine in enumerate(readData): sLineElements = strLine.split(cDelimiter) if sLineElements[0] == sLastMetadataName: iMetadataRow = iIndex iLongestLength = max(iLongestLength, len(sLineElements)) #If not already set, set iFirstDataIndex if iFirstDataIndex < 0: iFirstDataIndex = iMetadataRow + 1 #Used to substitute . to - reSubPeriod = re.compile('\.') #File writer with open(outputFile,'w') as f: #Write metadata #Empty data is changed to a default #Jagged ends are filled with a default for strDataLine in readData[:iFirstDataIndex]: lsLineElements = strDataLine.split(cDelimiter) for iindex, sElement in enumerate(lsLineElements): if not sElement.strip(): lsLineElements[iindex] = ConstantsBreadCrumbs.c_strEmptyDataMetadata if len(lsLineElements) < iLongestLength: lsLineElements = lsLineElements + ([ConstantsBreadCrumbs.c_strEmptyDataMetadata]*(iLongestLength-len(lsLineElements))) f.write(cDelimiter.join(lsLineElements)+ConstantsBreadCrumbs.c_strEndline) #For each data line in the table for line in readData[iFirstDataIndex:]: writeToFile = False cleanLine = list() #Break line into delimited elements lineElements = line.split(cDelimiter) #Clean feature name sCleanFeatureName = reSubPeriod.sub("-",lineElements[0]) #For each element but the first (taxa name) #Element check to see if not == zero #If so add to output for element in lineElements[1:]: if(element.strip() in string.whitespace): cleanLine.append(ConstantsBreadCrumbs.c_strEmptyAbundanceData) #Set abundance of 0 but do not indicate the line should be saved elif(element == "0"): cleanLine.append(element) #If an abundance is found set the line to be saved. else: cleanLine.append(element) writeToFile = True #Occurence filtering #Removes features that do not have a given level iLowestAbundance in a given amount of samples iLowestSampleOccurence if lOccurenceFilter: iLowestAbundance, iLowestSampleOccurence = lOccurenceFilter if iLowestSampleOccurence > sum([1 if float(sEntry) >= iLowestAbundance else 0 for sEntry in cleanLine]): writeToFile = False #Write to file if writeToFile: f.write(sCleanFeatureName+cDelimiter+cDelimiter.join(cleanLine)+ConstantsBreadCrumbs.c_strEndline) return outputFile def __repr__(self): """ Represent or print object. """ return "AbundanceTable" def __str__(self): """ Create a string representation of the Abundance Table. """ return "".join(["Sample count:", str(len(self._npaFeatureAbundance.dtype.names[1:])), os.linesep+"Feature count:", str(len(self._npaFeatureAbundance[self._npaFeatureAbundance.dtype.names[0]])), os.linesep+"Id Metadata:", self._npaFeatureAbundance.dtype.names[0], os.linesep+"Metadata ids:", str(self._dictTableMetadata.keys()), os.linesep+"Metadata count:", str(len(self._dictTableMetadata.keys())), os.linesep+"Originating source:",self._strOriginalName, os.linesep+"Original feature count:", str(self._iOriginalFeatureCount), os.linesep+"Original sample count:", str(self._iOriginalSampleCount), os.linesep+"Is normalized:", str(self._fIsNormalized), os.linesep+"Is summed:", str(self._fIsSummed), os.linesep+"Current filtering state:", str(self._strCurrentFilterState), os.linesep+"Feature delimiter:", self._cFeatureDelimiter, os.linesep+"File delimiter:",self._cDelimiter]) def __eq__(self, objOther): """ Check if an object is equivalent in data to this object Check to make sure that objOther is not None Check to make sure objOther is the correct class type Check to make sure self and other internal data are the same (exclusing file name) Check data and make sure the npa arrays are the same Check the metdata to make sure the dicts are the same (will need to sort the keys of the dicts before comparing, they do not guarentee any order. """ # Check for none if objOther is None: return False #Check for object type if isinstance(objOther,AbundanceTable) != True: return False #Check feature delimiter if self.funcGetFeatureDelimiter() != objOther.funcGetFeatureDelimiter(): return False #Check file delimiter if self.funcGetFileDelimiter() != objOther.funcGetFileDelimiter(): return False #************************************************** #* Commented out * #************************************************** #Check name - Commented out by GW on 2013/09/14 because #If we import pcl file into biom file and then export to pcl, the file names might be different but the tables might be the same #Check name #if self.funcGetName() != objOther.funcGetName(): #return False #Check sample metadata #Go through the metadata result1 = self.funcGetMetadataCopy() result2 = objOther.funcGetMetadataCopy() if sorted(result1.keys()) != sorted(result2.keys()): return False for strKey in result1.keys(): if strKey not in result2: return False if result1[strKey] != result2[strKey]: return False #TODO check the row (feature) metadata #TODO check the file metadata #Check the ID if self.funcGetFileDelimiter() != objOther.funcGetFileDelimiter(): return False #Check the date if self.dateCreationDate != objOther.dateCreationDate: return False #Check the format if self.strFileFormatType != objOther.strFileFormatType: return False #************************************************** #* Commented out * #************************************************** #Check source - Commented out by GW on 2013/09/14 because #If we import pcl file into biom file and then export to pcl, the file names might be different but the tables might be the same #Check the source #if self.strFileGenerationSource != objOther.strFileGenerationSource: #return False #Check the type if self.strFileType != objOther.strFileType: return False #Check the URL if self.strFileURL != objOther.strFileURL: return False #Check data #TODO go through the data #TODO also check the data type result1 = self.funcGetAbundanceCopy() result2 = objOther.funcGetAbundanceCopy() if len(result1) != len(result2): return False sorted_result1 = sorted(result1, key=lambda tup: tup[0]) sorted_result2 = sorted(result2, key=lambda tup: tup[0]) if sorted_result1 != sorted_result2 : return False #************************************************** #* Commented out * #************************************************** #Check AbundanceTable.__str__(self) - Commented out by GW on 2013/09/14 because #If we import pcl file into biom file and then export to pcl, the file names might be different but the tables might be the same #Check string representation #if AbundanceTable.__str__(self) != AbundanceTable.__str__(objOther): #return False #Check if sample ids are the same and in the same order if self.funcGetSampleNames() != objOther.funcGetSampleNames(): return False return True def __ne__(self, objOther): return not self == objOther #Testing Status: Light happy path testing #TODO: Tim change static to class methods @staticmethod def _funcTextToStructuredArray(xInputFile = None, cDelimiter = ConstantsBreadCrumbs.c_cTab, sMetadataID = None, sLastMetadataRow = None, sLastMetadata = None, ostmOutputFile = None): """ Private method Used to read in a file that is samples (column) and taxa (rows) into a structured array. :param xInputFile: File stream or path to input file. :type: String File stream or string path. :param cDelimiter: Delimiter for parsing the input file. :type: Character Character. :param sMetadataID: String ID that is a metadata row ID (found on the first column) and used as an ID for samples. If not given it is assumed to be position 0 :type: String String ID :param sLastMetadataRow: String ID that is the last row metadat id (id of the most right column with row/feature metadata) :type: String String ID :param sLastMetadata: The ID of the metadata that is the last metadata before measurement or feature rows. :type: String String ID :param ostmOutputFile: Output File to write to if needed. None does not write the file. :type: FileStream or String :return [taxData,metadata,rowmetadata]: Numpy Structured Array of abundance data and dictionary of metadata. Metadata is a dictionary as such {"ID", [value,value,values...]} Values are in the order thety are read in (and the order of the sample names). ID is the first column in each metadata row. - rowmetadata is a optional Numpy strucured array (can be None if not made) + rowmetadata is a optional RowMetadata object (can be None if not made) The rowmetadata and taxData row Ids should match - [Numpy structured Array, Dictionary, Numpy structured array] + The last dict is a collection of BIOM fielparameters when converting from a BIOM file + [Numpy structured Array, Dictionary, Numpy structured array, dict] """ # Open file from a stream or file path istmInput = open( xInputFile, 'rU' ) if isinstance(xInputFile, str) else xInputFile # Flag that when incremented will switch from metadata parsing to data parsing iFirstDataRow = -1 # Sample id row namesRow = None # Row metadata names lsRowMetadataIDs = None # Index of the last row metadata iIndexLastMetadataRow = None # Holds metadata {ID:[list of values]} metadata = dict() # Holds the data measurements [(tuple fo values)] dataMatrix = [] # Holds row metadata { sID : [ list of values ] } dictRowMetadata = {} # Positional index iIndex = -1 # File handle csvw = None # Read in files if ostmOutputFile: csvw = csv.writer( open(ostmOutputFile,'w') if isinstance(ostmOutputFile, str) else ostmOutputFile, csv.excel_tab, delimiter = cDelimiter ) # For each line in the file, and assume the tax id is the first element and the data follows for lsLineElements in csv.reader( istmInput, dialect = csv.excel_tab, delimiter = cDelimiter ): iIndex += 1 taxId, sampleReads = lsLineElements[0], lsLineElements[1:] # Read through data measurements # Process them as a list of tuples (needed for structured array) if iFirstDataRow > 0: try: # Parse the sample reads, removing row metadata and storing row metadata if it exists if lsRowMetadataIDs: # Build expected dict for row metadata dictionary {string feature id: {'metadata': {metadatakey: [list of metadata values]}}} dictFeature = dict([ [sID, [sKey]] for sID, sKey in zip( lsRowMetadataIDs, sampleReads[ 0 : iIndexLastMetadataRow ]) ]) if len( dictFeature ): dictRowMetadata[ taxId ] = { ConstantsBreadCrumbs.c_metadata_lowercase: dictFeature } dataMatrix.append(tuple([taxId] + [( float(s) if s.strip( ) else 0 ) for s in sampleReads[ iIndexLastMetadataRow: ]])) else: dataMatrix.append(tuple([taxId] + [( float(s) if s.strip( ) else 0 ) for s in sampleReads])) except ValueError: sys.stderr.write( "AbundanceTable:textToStructuredArray::Error, non-numerical value on data row. File:" + str(xInputFile) + " Row:" + str(lsLineElements) + "\n" ) return False # Go through study measurements else: # Read in metadata values, if the entry is blank then give it the default empty metadata value. for i, s in enumerate( sampleReads ): if not s.strip( ): sampleReads[i] = ConstantsBreadCrumbs.c_strEmptyDataMetadata # If no id metadata (sample ids) is given then the first row is assumed to be the id row, otherwise look for the id for the metadata. # Add the metadata to the containing dict if ( ( not sMetadataID ) and ( iIndex == 0 ) ) or ( taxId == sMetadataID ): namesRow = lsLineElements # Remove the row metadata ids, these names are for the column ID and the samples ids if sLastMetadataRow: iIndexLastMetadataRow = lsLineElements.index(sLastMetadataRow) lsRowMetadataIDs = namesRow[ 1 : iIndexLastMetadataRow + 1 ] namesRow = [ namesRow[ 0 ] ] + namesRow[ iIndexLastMetadataRow + 1: ] # If the sample metadata dictionary already has entries then remove the row metadata info from it. if len( metadata ) and len( lsRowMetadataIDs ): for sKey, lsValues in metadata.items(): metadata[ sKey ] = lsValues[ iIndexLastMetadataRow: ] # Set the metadata without row metadata entries metadata[taxId] = sampleReads[ iIndexLastMetadataRow: ] if (lsRowMetadataIDs and len( lsRowMetadataIDs )) else sampleReads # If the last metadata was just processed switch to data processing # If the last metadata name is not given it is assumed that there is only one metadata if ( not sLastMetadata ) or ( taxId == sLastMetadata ): iFirstDataRow = iIndex + 1 # If writing out the data write back out the line read in. # This happens at the end so that the above cleaning is captured and written. if csvw: csvw.writerow( [taxId] + sampleReads ) if sLastMetadata and ( not dataMatrix ): sys.stderr.write( "AbundanceTable:textToStructuredArray::Error, did not find the row for the last metadata ID. File:" + str(xInputFile) + " Identifier:" + sLastMetadata + "\n" ) return False # Make sure the names are found if namesRow == None: sys.stderr.write( "AbundanceTable:textToStructuredArray::Error, did not find the row for the unique sample/column. File:" + str(xInputFile) + " Identifier:" + sMetadataID + "\n" ) return False # Now we know the longest taxId we can define the first column holding the tax id # Gross requirement of Numpy structured arrays, a = ASCII followed by max # of characters (as a string) longestTaxId = max( len(a[0]) for a in dataMatrix ) dataTypeVector = [(namesRow[0],'a' + str(longestTaxId*2))] + [(s, "f4") for s in namesRow[1:]] # Create structured array taxData = np.array(dataMatrix,dtype=np.dtype(dataTypeVector)) # Returns a none currently because the PCL file specification this originally worked on did not have feature metadata # Can be updated in the future. # [Data (structured array), column metadata (dict), row metadata (structured array), file metadata (dict)] return [taxData, metadata, RowMetadata(dictRowMetadata = dictRowMetadata, lsRowMetadataIDs = lsRowMetadataIDs), { ConstantsBreadCrumbs.c_strIDKey:ConstantsBreadCrumbs.c_strDefaultPCLID, ConstantsBreadCrumbs.c_strDateKey:str(date.today()), ConstantsBreadCrumbs.c_strFormatKey:ConstantsBreadCrumbs.c_strDefaultPCLFileFormateType, ConstantsBreadCrumbs.c_strSourceKey:ConstantsBreadCrumbs.c_strDefaultPCLGenerationSource, ConstantsBreadCrumbs.c_strTypekey:ConstantsBreadCrumbs.c_strDefaultPCLFileTpe, ConstantsBreadCrumbs.c_strURLKey:ConstantsBreadCrumbs.c_strDefaultPCLURL, ConstantsBreadCrumbs.c_strSparsityKey:ConstantsBreadCrumbs. c_fDefaultPCLSparsity}] # def funcAdd(self,abndTwo,strFileName=None): # """ # Allows one to add an abundance table to an abundance table. They both must be the same state of normalization or summation # or they will be summed or normalized if one of the two are. # # :param abndTwo: AbundanceTable object 2 # :type: AbundanceTable # :return AbudanceTable: # """ # # #Check summation and normalization # if(self.funcIsSummed() or abndTwo.funcIsSummed()): # self.funcSum() # abndTwo.funcSum() # if(self.funcIsNormalized() or abndTwo.funcIsNormalized()): # self.funcNormalize() # abndTwo.funcNormalize() # # #Normalize Feature names # #Get if the abundance tables have clades # fAbndInputHasClades = self.funcHasFeatureHierarchy() # fAbndCompareHasClades = abndTwo.funcHasFeatureHierarchy() # # if(fAbndInputHasClades or fAbndCompareHasClades): # #If feature delimiters do not match, switch # if not self.funcGetFeatureDelimiter() == abndTwo.funcGetFeatureDelimiter(): # abndTwo.funcSetFeatureDelimiter(self.funcGetFeatureDelimiter()) # # #Add prefixes if needed. # self.funcAddCladePrefixToFeatures() # abndTwo.funcAddCladePrefixToFeatures() # # #Get feature Names # lsFeatures1 = self.funcGetFeatureNames() # lsFeatures2 = abndTwo.funcGetFeatureNames() # # #Make one feature name list # lsFeaturesCombined = list(set(lsFeatures1+lsFeature2)) # # #Add samples by features (Use 0.0 for empty data features, use NA for empty metadata features) # # # #Combine metadata # dictMetadata1 = self.funcGetMetadataCopy() # dictMetadata2 = abndTwo.funcGetMetadataCopy() # # #Get first table metadata and add NA for metadata it is missing for the length of the current metadata # lsMetadataOnlyInTwo = list(set(dictMetadata2.keys())-set(dictMetadata1.keys())) # dictCombinedMetadata = dictMetadata1 # lsEmptyMetadata = ["NA" for i in xrange(self.funcGetSampleCount())] # for sKey in lsMetadataOnlyInTwo: # dictCombinedMetadata[sKey]=lsEmptyMetadata # #Add in the other metadata dictionary # lsCombinedKeys = dictCombinedMetadata.keys() # lsEmptyMetadata = ["NA" for i in xrange(abndTwo.funcGetSampleCount())] # for sKey in lsCombinedKeys(): # if sKey in dictMetadata2: # dictCombinedMetadata[sKey] = dictCombinedMetadata[sKey]+dictMetadata2[sKey] # else: # dictCombinedMetadata[sKey] = dictCombinedMetadata[sKey]+lsEmptyMetadata # # #Make Abundance table # return AbundanceTable(npaAbundance=npaAbundance, # dictMetadata = dictCombinedMetadata, # strName = strFileName if strFileName else os.path.splitext(self)[0]+"_combined_"+os.path.splitext(abndTwo)[0], # strLastMetadata = self.funcGetLastMetadataName(), # cFileDelimiter = self.funcGetFileDelimiter(), cFeatureNameDelimiter=self.funcGetFeatureDelimiter()) #TODO This does not adjust for sample ordering, needs to def funcAddDataFeature(self, lsNames, npdData): """ Adds a data or group of data to the underlying table. Names should be in the order of the data Each row is considered a feature (not sample). :param lsNames: Names of the features being added to the data of the table :type: List List of string names :param npdData: Rows of features to add to the table :type: Numpy array accessed by row. """ if ( self._npaFeatureAbundance == None ): return False # Check number of input data rows iDataRows = npdData.shape[0] if (len(lsNames) != iDataRows): print "Error:The names and the rows of data features to add must be of equal length" # Grow the array by the neccessary amount and add the new rows iTableRowCount = self.funcGetFeatureCount() iRowElementCount = self.funcGetSampleCount() self._npaFeatureAbundance.resize(iTableRowCount+iDataRows) for iIndexData in xrange(iDataRows): self._npaFeatureAbundance[iTableRowCount+iIndexData] = tuple([lsNames[iIndexData]]+list(npdData[iIndexData])) return True #TODO This does not adjust for sample ordering, needs to def funcAddMetadataFeature(self,lsNames,llsMetadata): """ Adds metadata feature to the underlying table. Names should be in the order of the lists of metadata Each internal list is considered a metadata and paired to a name """ if ( self._dictTableMetadata == None ): return False # Check number of input data rows iMetadataCount = len(llsMetadata) if (len(lsNames) != iMetadataCount): print "Error:The names and the rows of metadata features to add must be of equal length" # Add the metadata for tpleMetadata in zip(lsNames,llsMetadata): self._dictTableMetadata[tpleMetadata[0]]=tpleMetadata[1] return True #2 test Cases def funcSetFeatureDelimiter(self, cDelimiter): """ Changes the feature delimiter to the one provided. Updates the feature names. :param cDelimiter: Character feature delimiter :type: Character :return Boolean: Indicator of success or not (false) """ if ( self._npaFeatureAbundance == None ): return False cDelimiterCurrent = self.funcGetFeatureDelimiter() if ( not cDelimiter or not cDelimiterCurrent): return False #Make new feature names lsNewFeatureNames = [sFeatureName.replace(cDelimiterCurrent,cDelimiter) for sFeatureName in self.funcGetFeatureNames()] #Update new feature names to abundance table if (not self.funcGetIDMetadataName() == None): self._npaFeatureAbundance[self.funcGetIDMetadataName()] = np.array(lsNewFeatureNames) #Update delimiter self._cFeatureDelimiter = cDelimiter return True #Happy path tested def funcGetSampleNames(self): """ Returns the sample names (IDs) contained in the abundance table. :return Sample Name: A List of sample names indicated by the metadata associated with the sMetadataId given in table creation. A list of string names or empty list on error as well as no underlying table. """ return self._npaFeatureAbundance.dtype.names[1:] if ( self._npaFeatureAbundance != None ) else [] #Happy Path Tested def funcGetIDMetadataName(self): """ Returns the metadata id. :return ID: The metadata id (the sample Id). Returns none on error. """ return self._npaFeatureAbundance.dtype.names[0] if ( self._npaFeatureAbundance != None ) else None #Happy path tested def funcGetAbundanceCopy(self): """ Returns a deep copy of the abundance table. :return Numpy Structured Array: The measurement data in the Abundance table. Can use sample names to access each column of measurements. Returns none on error. """ return self._npaFeatureAbundance.copy() if ( self._npaFeatureAbundance != None ) else None #Happy path tested def funcGetAverageAbundancePerSample(self, lsTargetedFeatures): """ Averages feature abundance within a sample. :param lsTargetedFeatures: String names of features to average :type: List of string names of features which are measured :return List: List of lists or boolean (False on error). One internal list per sample indicating the sample and the feature's average abudance [[sample,average abundance of selected taxa]] or False on error """ #Sample rank averages [[sample,average abundance of selected taxa]] sampleAbundanceAverages = [] sampleNames = self.funcGetSampleNames() allTaxaNames = self.funcGetFeatureNames() #Get an abundance table compressed to features of interest abndReducedTable = self.funcGetFeatureAbundanceTable(lsTargetedFeatures) if abndReducedTable == None: return False #If the taxa to be selected are not in the list, Return nothing and log lsMissing = [] for sFeature in lsTargetedFeatures: if not sFeature in allTaxaNames: lsMissing.append(sFeature) else: #Check to make sure the taxa of interest is not average abundance of 0 if not abndReducedTable.funcGetFeatureSumAcrossSamples(sFeature): lsMissing.append(sFeature) if len(lsMissing) > 0: sys.stderr.write( "Could not find features for averaging: " + str(lsMissing) ) return False #For each sample name get average abundance for sName in sampleNames: npaFeaturesSample = abndReducedTable.funcGetSample(sName) sampleAbundanceAverages.append([sName,sum(npaFeaturesSample)/float(len(npaFeaturesSample))]) #Sort based on average return sorted(sampleAbundanceAverages, key = lambda sampleData: sampleData[1], reverse = True) #Happy path tested 1 def funcGetAverageSample(self): """ Returns the average sample of the abundance table. This average sample is made of the average of each feature. :return list: A list of averages in the order of the feature names. """ ldAverageSample = [] #If there are no samples then return empty list. if len(self.funcGetSampleNames()) < 1: return ldAverageSample #If there are samples return the average of each feature in the order of the feature names. for sFeature in self._npaFeatureAbundance: npFeaturesAbundance = list(sFeature)[1:] ldAverageSample.append(sum(npFeaturesAbundance)/float(len(npFeaturesAbundance))) return ldAverageSample #Tested 2 cases def funcHasFeatureHierarchy(self): """ Returns an indicator of having a hierarchy in the features indicated by the existance of the feature delimiter. :return Boolean: True (Has a hierarchy) or False (Does not have a hierarchy) """ if ( self._npaFeatureAbundance == None ): return None cDelimiter = self.funcGetFeatureDelimiter() if ( not cDelimiter ): return False #For each feature name, check to see if the delimiter is in the name for sFeature in self.funcGetFeatureNames(): if cDelimiter in sFeature: return True return False def funcGetCladePrefixes(self): """ Returns the list of prefixes to use on biological sample hierarchy :return List: List of strings """ return self._lsCladePrefixes #3 test cases def funcAddCladePrefixToFeatures(self): """ As a standardized clade prefix to indicate biological clade given hierarchy. Will not add a prefix to already prefixes feature names. Will add prefix to feature names that do not have them or clades in a feature name that do not have them while leaving ones that do as is. :return Boolean: True (Has a hierarchy) or False (Does not have a hierarchy) """ if ( self._npaFeatureAbundance == None ): return None cDelimiter = self.funcGetFeatureDelimiter() lsPrefixes = self.funcGetCladePrefixes() iPrefixLength = len(lsPrefixes) if ( not cDelimiter ): return False #Append prefixes to feature names lsUpdatedFeatureNames = [] lsFeatureNames = self.funcGetFeatureNames() for sFeatureName in lsFeatureNames: lsClades = sFeatureName.split(cDelimiter) #If there are not enough then error if(len(lsClades) > iPrefixLength): print "Error:: Too many clades given to be biologically meaningful" return False lsUpdatedFeatureNames.append(cDelimiter.join([lsPrefixes[iClade]+lsClades[iClade] if not(lsClades[iClade][0:len(lsPrefixes[iClade])]==lsPrefixes[iClade]) else lsClades[iClade] for iClade in xrange(len(lsClades))])) #Update new feature names to abundance table if not self.funcGetIDMetadataName() == None: self._npaFeatureAbundance[self.funcGetIDMetadataName()] = np.array(lsUpdatedFeatureNames) return True #Happy Path Tested def funcGetFeatureAbundanceTable(self, lsFeatures): """ Returns a copy of the current abundance table with the abundance of just the given features. :param lsFeatures: String Feature IDs that are kept in the compressed abundance table. :type: List of strings Feature IDs (found as the first entry of a filter in the input file. :return AbundanceTable: A compressed version of the abundance table. On an error None is returned. """ if ( self._npaFeatureAbundance == None ) or ( lsFeatures == None ): return None #Get a list of boolean indicators that the row is from the features list lfFeatureData = [sRowID in lsFeatures for sRowID in self.funcGetFeatureNames()] #compressed version as an Abundance table lsNamePieces = os.path.splitext(self._strOriginalName) abndFeature = AbundanceTable(npaAbundance=np.compress(lfFeatureData, self._npaFeatureAbundance, axis = 0), dictMetadata = self.funcGetMetadataCopy(), strName = lsNamePieces[0] + "-" + str(len(lsFeatures)) +"-Features"+lsNamePieces[1], strLastMetadata=self.funcGetLastMetadataName(), cFileDelimiter = self.funcGetFileDelimiter(), cFeatureNameDelimiter= self.funcGetFeatureDelimiter()) #Table is no longer normalized abndFeature._fIsNormalized = False return abndFeature #Happy path tested def funcGetFeatureDelimiter(self): """ The delimiter of the feature names (For example to use on concensus lineages). :return Character: Delimiter for the feature name pieces if it is complex. """ return self._cFeatureDelimiter #Happy path tested def funcGetFeatureCount(self): """ Returns the current feature count. :return Count: Returns the int count of features in the abundance table. Returns None on error. """ return self._npaFeatureAbundance.shape[0] if not self._npaFeatureAbundance is None else 0 #Happy path tested def funcGetFeatureSumAcrossSamples(self,sFeatureName): """ Returns float sum of feature values across the samples. :param sFeatureName: The feature ID to get the sum from. :type: String. :return Double: Sum of one feature across samples. """ return sum(self.funcGetFeature(sFeatureName)) def funcGetFeature(self,sFeatureName): """ Returns feature values across the samples. :param sFeatureName: The feature ID to get the sum from. :type: String. :return Double: Feature across samples. """ for sFeature in self._npaFeatureAbundance: if sFeature[0] == sFeatureName: return list(sFeature)[1:] return None #Happy path tested def funcGetFeatureNames(self): """ Return the feature names as a list. :return Feature Names: List of feature names (or IDs) as strings. As an error returns empty list. """ if (not self._npaFeatureAbundance == None): return self._npaFeatureAbundance[self.funcGetIDMetadataName()] return [] #Happy path tested def funcGetFileDelimiter(self): """ The delimiter of the file the data was read from and which is also the delimiter which would be used to write the data to a file. :return Character: Delimiter for the parsing and writing the file. """ return self._cDelimiter def funcGetLastMetadataName(self): """ Get the last metadata name that seperates abundance and metadata measurements. :return string: Metadata name """ return self._strLastMetadataName #Happy path tested def funcGetSample(self,sSampleName): """ Return a copy of the feature measurements of a sample. :param sSampleName: Name of sample to return. :type: String :return Sample: Measurements Feature measurements of a sample. Empty numpy array returned on error. """ if (not self._npaFeatureAbundance == None): return self._npaFeatureAbundance[sSampleName].copy() return np.array([]) #Happy path tested def funcGetMetadata(self, strMetadataName): """ Returns a list of metadata that is associated with the given metadata name (id). :param strMetadataName: String metadata ID to be returned :type: String ID :return Metadata: List of metadata """ return copy.deepcopy( self._dictTableMetadata.get(strMetadataName) ) \ if self._dictTableMetadata else None #Happy path tested def funcGetMetadataCopy(self): """ Returns a deep copy of the metadata. :return Metadata copy: {"ID":[value,value...]} """ return copy.deepcopy(self._dictTableMetadata) #Happy path tested def funcGetName(self): """ Returns the name of the object which is the file name that generated it. If the object was generated from an Abundance Table (for instance through stratification) the name is still in the form of a file that could be written to which is informative of the changes that have occurred on the data set. :return string: Name """ return self._strOriginalName #Happy path tested. could do more def funcGetTerminalNodes(self): """ Returns the terminal nodes given the current feature names in the abundance table. The features must contain a consensus lineage or all will be returned. :return List: List of strings of the terminal nodes given the abundance table. """ return AbundanceTable.funcGetTerminalNodesFromList(lsNames=self.funcGetFeatureNames(),cNameDelimiter=self.funcGetFeatureDelimiter()) #Tested 2 test cases @staticmethod def funcGetTerminalNodesFromList(lsNames,cNameDelimiter): """ Returns the terminal nodes given the current feature names in the abundance table. The features must contain a consensus lineage or all will be returned. :param lsNames: The list of string names to parse and filter. :type: List of strings :param cNameDelimiter: The delimiter for the name of the features. :type: Character Delimiter :return list: A list of terminal elements in the list (given only the list). """ #Build hash dictCounts = dict() for strTaxaName in lsNames: #Split into the elements of the clades lsClades = filter(None,strTaxaName.split(cNameDelimiter)) #Count clade levels iCladeLength = len(lsClades) #Evaluate first element sClade = lsClades[0] dictCounts[sClade] = sClade not in dictCounts #Evaluate the rest of the elements if iCladeLength < 2: continue for iIndex in xrange(1,iCladeLength): prevClade = sClade sClade = cNameDelimiter.join([sClade,lsClades[iIndex]]) if sClade in dictCounts: dictCounts[sClade] = dictCounts[prevClade] = False else: dictCounts[sClade] = True dictCounts[prevClade] = False #Return only the elements that were of count 1 return filter( lambda s: dictCounts[s] == True, dictCounts ) #Happy path tested def funcIsNormalized(self): """ Returns if the data has been normalized. :return Boolean: Indicates if the data is normalized. True indicates it the data is normalized. """ return self._fIsNormalized #Happy path tested def funcIsPrimaryIdMetadata(self,sMetadataName): """ Checks the metadata data associated with the sMetadatName and returns if the metadata is unique. This is important to some of the functions in the Abundance Table specifically when translating from one metadata to another. :param sMetadataName: ID of metadata to check for uniqueness. :type: String Metadata ID. :return Boolean: Returns indicator of uniqueness. True indicates unique. """ lMetadata = self.funcGetMetadata(sMetadataName) if not lMetadata: return False return (len(lMetadata) == len(set(lMetadata))) #Happy path tested def funcIsSummed(self): """ Return is the data is summed. :return Boolean: Indicator of being summed. True indicates summed. """ return self._fIsSummed #Happy path tested def funcFilterAbundanceByPercentile(self, dPercentileCutOff = 95.0, dPercentageAbovePercentile=1.0): """ Filter on features. A feature is removed if it's abundance is not found in the top X percentile a certain percentage of the samples. :param dPercentileCutOff: The percentile used for filtering. :type: double A double between 0.0 and 100.0 :param dPercentageAbovePercentile: The percentage above the given percentile (dPercentileCutOff) that must exist to keep the feature. :type: double Between 0.0 and 100.0 :return Boolean: Indicator of filtering occuring without error. True indicates filtering occuring. """ #No need to do anything if(dPercentileCutOff==0.0) or (dPercentageAbovePercentile==0.0): return True #Sample names lsSampleNames = self.funcGetSampleNames() #Scale percentage out of 100 dPercentageAbovePercentile = dPercentageAbovePercentile/100.0 #Sample count iSampleCount = len(lsSampleNames) #Get a threshold score of the value at the specified percentile for each sample #In the order of the sample names ldScoreAtPercentile = [scipy.stats.scoreatpercentile(self._npaFeatureAbundance[lsSampleNames[iIndex]],dPercentileCutOff) for iIndex in xrange(iSampleCount)] #Record how many entries for each feature have a value equal to or greater than the dPercentileCutOff #If the percentile of entries passing the criteria are above the dPercentageAbovePercentile put index in list to keep liKeepIndices = [] iSampleCount = float(iSampleCount) for iRowIndex, npaRow in enumerate(self._npaFeatureAbundance): iCountPass = sum([1 if dValue >= ldScoreAtPercentile[iValueIndex] else 0 for iValueIndex, dValue in enumerate(list(npaRow)[1:])]) if (iCountPass / iSampleCount) >= dPercentageAbovePercentile: liKeepIndices.append(iRowIndex) #Compress array self._npaFeatureAbundance = self._npaFeatureAbundance[liKeepIndices,:] #Update filter state self._strCurrentFilterState += ":dPercentileCutOff=" + str(dPercentileCutOff) + ",dPercentageAbovePercentile=" + str(dPercentageAbovePercentile) #Table is no longer normalized self._fIsNormalized = False return True def funcFilterAbundanceByMinValue(self, dMinAbundance = 0.0001, iMinSamples = 3): """ Filter abundance by requiring features to have a minimum relative abundance in a minimum number of samples. Will evaluate greater than or equal to the dMinAbundance and iMinSamples. :param dMinAbundance: Minimum relative abundance. :type: Real Number Less than 1. :param iMinSamples: Minimum samples to have the relative abundnace or greater in. :type: Integer Number greater than 1. :return Boolean: Indicator of the filter running without error. False indicates error. """ #No need to do anything if(dMinAbundance==0) or (iMinSamples==0): return True #This normalization requires the data to be relative abundance if not self._fIsNormalized: #sys.stderr.write( "Could not filter by sequence occurence because the data is already normalized.\n" ) return False #Holds which indexes are kept liKeepFeatures = [] for iRowIndex, dataRow in enumerate( self._npaFeatureAbundance ): #See which rows meet the criteria and keep the index if needed. if len( filter( lambda d: d >= dMinAbundance, list(dataRow)[1:] ) ) >= iMinSamples: liKeepFeatures.append(iRowIndex) #Compress array self._npaFeatureAbundance = self._npaFeatureAbundance[liKeepFeatures,:] #Update filter state self._strCurrentFilterState += ":dMinAbundance=" + str(dMinAbundance) + ",iMinSamples=" + str(iMinSamples) return True #Happy path tested def funcFilterAbundanceBySequenceOccurence(self, iMinSequence = 2, iMinSamples = 2): """ Filter occurence by requiring features to have a minimum sequence occurence in a minimum number of samples. Will evaluate greater than or equal to the iMinSequence and iMinSamples. :param iMinSequence: Minimum sequence to occur. :type: Integer Number Greater than 1. :param iMinSamples: Minimum samples to occur in. :type: Integer Number greater than 1. :return Boolean: Indicator of the filter running without error. False indicates error. """ #No need to do anything if(iMinSequence==0) or (iMinSamples==0): return True #This normalization requires the data to be reads if self._fIsNormalized: #sys.stderr.write( "Could not filter by sequence occurence because the data is already normalized.\n" ) return False #Holds which indexes are kept liKeepFeatures = [] for iRowIndex, dataRow in enumerate( self._npaFeatureAbundance ): #See which rows meet the criteria and keep the index if needed. if len( filter( lambda d: d >= iMinSequence, list(dataRow)[1:] ) ) >= iMinSamples: liKeepFeatures.append(iRowIndex) #Compress array self._npaFeatureAbundance = self._npaFeatureAbundance[liKeepFeatures,:] #Update filter state self._strCurrentFilterState += ":iMinSequence=" + str(iMinSequence) + ",iMinSamples=" + str(iMinSamples) return True #1 Happy path test def funcFilterFeatureBySD(self, dMinSDCuttOff = 0.0): """ A feature is removed if it's abundance is not found to have standard deviation more than the given dMinSDCutoff. :param dMinSDCuttOff: Standard deviation threshold. :type: Double A double greater than 0.0. :return Boolean: Indicator of success. False indicates error. """ #No need to do anything if(dMinSDCuttOff==0.0): return True #Holds which indexes are kept liKeepFeatures = [] #Evaluate each sample for iRowIndex, dataRow in enumerate(self._npaFeatureAbundance): if(np.std(list(dataRow)[1:])>=dMinSDCuttOff): liKeepFeatures.append(iRowIndex) #Compress array self._npaFeatureAbundance = self._npaFeatureAbundance[liKeepFeatures,:] #Update filter state self._strCurrentFilterState += ":dMinSDCuttOff=" + str(dMinSDCuttOff) #Table is no longer normalized self._fIsNormalized = False return True #Happy path tested 2 tests def funcGetWithoutOTUs(self): """ Remove features that are terminal otus. Terminal otus are identified as being an integer. """ #Get the feature names lsFeatures = self.funcGetFeatureNames() #Reduce, filter the feature names lsFeatures = [sFeature for sFeature in lsFeatures if not (ValidateData.funcIsValidStringInt(sFeature.split(self.funcGetFeatureDelimiter())[-1]))] return self.funcGetFeatureAbundanceTable(lsFeatures) #Happy path tested def funcNormalize(self): """ Convenience method which will call which ever normalization is approriate on the data. :return Boolean: Indicator of success (true). """ if self._fIsSummed: return self.funcNormalizeColumnsWithSummedClades() else: return self.funcNormalizeColumnsBySum() #Testing Status: Light happy path testing def funcNormalizeColumnsBySum(self): """ Normalize the data in a manner that is approrpiate for NOT summed data. Normalize the columns (samples) of the abundance table. Normalizes as a fraction of the total (number/(sum of all numbers in the column)). Will not act on summed tables. :return Boolean: Indicator of success. False indicates error. """ if self._fIsNormalized: # sys.stderr.write( "This table is already normalized, did not perform new normalization request.\n" ) return False if self._fIsSummed: sys.stderr.write( "This table has clades summed, this normalization is not appropriate. Did not perform.\n" ) return False #Normalize for columnName in self.funcGetSampleNames(): column = self._npaFeatureAbundance[columnName] columnTotal = sum(column) if(columnTotal > 0.0): column = column/columnTotal self._npaFeatureAbundance[columnName] = column #Indicate normalization has occured self._fIsNormalized = True return True #Happy path tested def funcNormalizeColumnsWithSummedClades(self): """ Normalizes a summed Abundance Table. If this is called on a dataset which is not summed and not normalized. The data will be summed first and then normalized. If already normalized, the current normalization is kept. :return Boolean: Indicator of success. False indicates error. """ if self._fIsNormalized: # sys.stderr.write( "This table is already normalized, did not perform new summed normalization request.\n" ) return False if not self._fIsSummed: sys.stderr.write( "This table does not have clades summed, this normalization is not appropriate until the clades are summed. The clades are being summed now before normalization.\n" ) self.funcSumClades() #Load a hash table with root data {sKey: npaAbundances} hashRoots = {} for npaRow in self._npaFeatureAbundance: curldAbundance = np.array(list(npaRow)[1:]) curFeatureNameLength = len(npaRow[0].split(self._cFeatureDelimiter)) curlRootData = hashRoots.get(npaRow[0].split(self._cFeatureDelimiter)[0]) if not curlRootData: hashRoots[npaRow[0].split(self._cFeatureDelimiter)[0]] = [curFeatureNameLength, curldAbundance] elif curlRootData[0] > curFeatureNameLength: hashRoots[npaRow[0].split(self._cFeatureDelimiter)[0]] = [curFeatureNameLength, curldAbundance] #Normalize each feature by thier root feature dataMatrix = list() for npaRow in self._npaFeatureAbundance: curHashRoot = list(hashRoots[npaRow[0].split(self._cFeatureDelimiter)[0]][1]) dataMatrix.append(tuple([npaRow[0]]+[npaRow[i+1]/curHashRoot[i] if curHashRoot[i] > 0 else 0 for i in xrange(len(curHashRoot))])) self._npaFeatureAbundance = np.array(dataMatrix,self._npaFeatureAbundance.dtype) #Indicate normalization has occured self._fIsNormalized = True return True def _funcRankAbundanceHelper( self, aaTodo, iRank, lRankAbundance ): """ Helper method for ranking abudance which are tied. :params aaTodo: List of tied ranks to change to a rank. :type: List of Enumerates of samples. :params iRank: Current Rank :type: Integer :params lRankAbundance: Sample of abundance :type: List of integers """ # Subtract one from iRank (each time) to account for next loop iteration # Then average it with itself minus (the length of aaTodo + 1) dRank = ( iRank + iRank - len( aaTodo ) - 1 ) / 2.0 for a in aaTodo: lRankAbundance[a[0]] = dRank #1 Happy path test def funcRankAbundance(self): """ Rank abundances of features with in a sample. :return AbundanceTable: Abundance table data ranked (Features with in samples). None is returned on error. """ if self._npaFeatureAbundance == None: return None lsSampleNames = self.funcGetSampleNames() npRankAbundance = self.funcGetAbundanceCopy() liRanks = [] #For each sample name get the ranks for sName in lsSampleNames: #Enumerate for order and sort abundances lfSample = list(enumerate(npRankAbundance[sName])) lfSample = sorted(lfSample, key = lambda a: a[1], reverse = True) # Accumulate indices until a new value is encountered to detect + handle ties aaTodo = [] for i, a in enumerate( lfSample ): if ( not aaTodo ) or ( a[1] == aaTodo[-1][1] ): aaTodo.append( a ) else: # Make multiple tied ranks = average of first and last self._funcRankAbundanceHelper( aaTodo, i, npRankAbundance[sName] ) aaTodo = [a] self._funcRankAbundanceHelper( aaTodo, i + 1, npRankAbundance[sName] ) abndRanked = AbundanceTable(npaAbundance=npRankAbundance, dictMetadata=self.funcGetMetadataCopy(), strName= self.funcGetName() + "-Ranked", strLastMetadata=self.funcGetLastMetadataName(), cFileDelimiter=self.funcGetFileDelimiter(), cFeatureNameDelimiter=self.funcGetFeatureDelimiter()) #Table is no longer normalized abndRanked._fIsNormalized = False return abndRanked def funcGetSampleCount(self): """ Returns the sample count of the abundance table. """ return len(self.funcGetSampleNames()) #Happy Path Tested def funcReduceFeaturesToCladeLevel(self, iCladeLevel): """ Reduce the current table to a certain clade level. :param iCladeLevel: The level of the clade to trim the features to. :type: Integer The higher the number the more clades are presevered in the consensus lineage contained in the feature name. :return Boolean: Indicator of success. False indicates error. """ if iCladeLevel < 1: return False if not self._npaFeatureAbundance == None: liFeatureKeep = [] [liFeatureKeep.append(tplFeature[0]) if (len(tplFeature[1][0].split(self.funcGetFeatureDelimiter())) <= iCladeLevel) else 0 for tplFeature in enumerate(self._npaFeatureAbundance)] #Compress array self._npaFeatureAbundance = self._npaFeatureAbundance[liFeatureKeep,:] #Update filter state self._strCurrentFilterState += ":iCladeLevel=" + str(iCladeLevel) return True else: return False #Happy path tested def funcRemoveSamples(self,lsSampleNames): """ Removes the samples given in the list. :param lsSampleNames: A list of string names of samples to remove. :type: List of strings Unique values :return Boolean: Indicator of success (True = success, no error) """ #Samples to remove setSamples = set(lsSampleNames) #Get orignal sample count iOriginalCount = self._iOriginalSampleCount #The samples to keep lsKeepSamples = [sSample for sSample in self.funcGetSampleNames() if not sSample in setSamples] #The sample to keep as boolean flags for compressing the metadata lfKeepSamples = [not sSample in setSamples for sSample in self.funcGetSampleNames()] #Reduce the abundance data and update self._npaFeatureAbundance = self._npaFeatureAbundance[[self.funcGetIDMetadataName()]+lsKeepSamples] #Reduce the metadata and update for sKey in self._dictTableMetadata: self._dictTableMetadata[sKey] = [value for iindex, value in enumerate(self._dictTableMetadata[sKey]) if lfKeepSamples[iindex]] #Update sample number count self._iOriginalSampleCount = len(self.funcGetSampleNames()) return self._iOriginalSampleCount == (iOriginalCount-len(setSamples)) #Happy path tested def funcRemoveSamplesByMetadata(self, sMetadata, lValuesToRemove): """ Removes samples from the abundance table based on values of a metadata. If a metadata has any value given the associated sample is removed. :param sMetadata: ID of the metdata to check the given values. :type: String Metadata ID :param lValuesToRemove: A list of values which if equal to a metadata entry indicate to remove the associated sample. :type: List of values: List :return Boolean: Indicator of success (True = success, no error) """ lsSampleNames = self.funcGetSampleNames() return self.funcRemoveSamples([lsSampleNames[iindex] for iindex, sValue in enumerate(self.funcGetMetadata(sMetadata)) if sValue in lValuesToRemove]) #Happy path testing def funcSumClades(self): """ Sums abundance data by clades indicated in the feature name (as consensus lineages). :return Boolean: Indicator of success. False indicates an error. """ if not self.funcIsSummed(): #Read in the data #Find the header column (iCol) assumed to be 1 or 2 depending on the location of "NAME" #Create a list (adSeq) that will eventually hold the sum of the columns of data astrHeaders = iCol = None adSeqs = np.array([0] * len(self.funcGetSampleNames())) pTree = CClade( ) aastrRaw = [] #For each row in the npaAbundance #Get the feature name, feature abundances, and sum up the abudance columns #Keep the sum for later normalization #Give a tree the feature name and abundance for dataRow in self._npaFeatureAbundance: sFeatureName = dataRow[0] ldAbundances = list(dataRow)[1:] #Add to the sum of the columns (samples) adSeqs = adSeqs + np.array(list(dataRow)[1:]) #Build tree pTree.get( sFeatureName.split(self._cFeatureDelimiter) ).set( ldAbundances ) #Create tree of data #Input missing data #Fill hashFeatures with the clade name (key) and a blist of values (value) of the specified level interested. pTree.impute( ) hashFeatures = {} pTree.freeze( hashFeatures, c_iSumAllCladeLevels, c_fOutputLeavesOnly ) setstrFeatures = hashFeatures.keys( ) #Remove parent clades that are identical to child clades for strFeature, adCounts in hashFeatures.items( ): astrFeature = strFeature.strip( ).split( "|" ) while len( astrFeature ) > 1: astrFeature = astrFeature[:-1] strParent = "|".join( astrFeature ) adParent = hashFeatures.get( strParent ) if adParent == adCounts: del hashFeatures[strParent] setstrFeatures.remove( strParent ) #Sort features to be nice astrFeatures = sorted( setstrFeatures ) #Change the hash table to an array dataMatrix = list() for sFeature in astrFeatures: dataMatrix.append(tuple([sFeature]+list(hashFeatures[sFeature]))) self._npaFeatureAbundance=np.array(dataMatrix,self._npaFeatureAbundance.dtype) #Indicate summation has occured self._fIsSummed = True return True #Happy path tested def funcStratifyByMetadata(self, strMetadata, fWriteToFile=False): """ Stratifies the AbundanceTable by the given metadata. Will write each stratified abundance table to file if fWriteToFile is True the object will used it's internally stored name as a file to write to if fWriteToFile is a string then it should be a directory and end with "." This will rebase the file and store it in a different directory but with an otherwise unchanged name. Note: If the metadata used for stratification has NAs, they will be segregated to thier own table and returned. :param strMetadata: Metadata ID to stratify data with. :type: String ID for a metadata. :param fWriteToFile: Indicator to write to file. :type: Boolean True indicates to write to file. :return List: List of AbundanceTables which are deep copies of the original. Empty list on error. """ if self._npaFeatureAbundance is None or self._dictTableMetadata is None: return [] #Get unique metadata values to stratify by lsMetadata = self._dictTableMetadata.get(strMetadata,[]) setValues = set(lsMetadata) #If there is only one metadata value then no need to stratify so return the original in the list (and write if needed) if len(setValues) == 0: return [] retlAbundanceTables = [] dictAbundanceBlocks = dict() #Given here there are multiple metadata values, continue to stratify lsNames = self.funcGetSampleNames() #Get index of values to break up for value in setValues: lfDataIndex = [sData==value for sData in lsMetadata] #Get abundance data for the metadata value #The true is added to keep the first column which should be the feature id npaStratfiedAbundance = self._npaFeatureAbundance[[self.funcGetIDMetadataName()]+list(np.compress(lfDataIndex,lsNames))] #Get metadata for the metadata value dictStratifiedMetadata = dict() for metadataType in self._dictTableMetadata: dictValues = self.funcGetMetadata(metadataType) dictStratifiedMetadata[metadataType] = np.compress(lfDataIndex,dictValues).tolist() #Make abundance table #Add abundance table to the list lsNamePieces = os.path.splitext(self._strOriginalName) objStratifiedAbundanceTable = AbundanceTable(npaAbundance=npaStratfiedAbundance, dictMetadata=dictStratifiedMetadata, strName=lsNamePieces[0] + "-StratBy-" + value+lsNamePieces[1], strLastMetadata=self.funcGetLastMetadataName(), cFeatureNameDelimiter=self._cFeatureDelimiter, cFileDelimiter = self._cDelimiter) if fWriteToFile: objStratifiedAbundanceTable.funcWriteToFile(lsNamePieces[0] + "-StratBy-" + value+lsNamePieces[1]) #Append abundance table to returning list retlAbundanceTables.append(objStratifiedAbundanceTable) return retlAbundanceTables #Happy Path Tested def funcTranslateIntoMetadata(self, lsValues, sMetadataFrom, sMetadataTo, fFromPrimaryIds=True): """ Takes the given data values in one metadata and translates it to values in another metadata of the sample samples holding the values of the first metadata FPrimaryIds, if true the sMetadataFrom are checked for unique values, If FPrimaryIds is not true, duplicate values can stop the preservation of order Or may cause duplication in the "to" group. This is not advised. if the sMetadataFrom has any duplicates the function fails and return false. :param lsValues: Values to translate. :type: List List of values. :param sMetadataFrom: The metadata the lsValues come from. :type: String ID for the metadata. :param sMetadataTo: The metadata the lsValues will be translated into keeping the samples the same. :type: String ID for the metadata. :param fFromPrimaryIds: The metadata that are in the from metadata list must be unique in each sample. :type: Boolean True indicates the metadata list should be unique in each sample. Otherwise a false will return. :return List: List of new values or False on error. """ #Get metadata lFromMetadata = self.funcGetMetadata(sMetadataFrom) if not lFromMetadata: sys.stderr.write( "Abundancetable::funcTranlateIntoMetadata. Did not receive lFromMetadata.\n" ) return False lToMetadata = self.funcGetMetadata(sMetadataTo) if not lToMetadata: sys.stderr.write( "Abundancetable::funcTranlateIntoMetadata. Did not receive lToMetadata.\n" ) return False #Check to see if the values are unique if indicated to do so if fFromPrimaryIds: if not len(lFromMetadata) == len(set(lFromMetadata)): sys.stderr.write( "Abundancetable::funcTranlateIntoMetadata. sMetadataFrom did not have unique values.\n" ) return False #Translate over if lFromMetadata and lToMetadata: return [lToMetadata[iIndex] for iIndex in [lFromMetadata.index(value) for value in lsValues]] return False #Happy path tested def funcToArray(self): """ Returns a numpy array of the current Abundance Table. Removes the first ID head column and the numpy array is Made of lists, not tuples. :return Numpy Array: np.array([[float,float,...],[float,float,...],[float,float,...]]) None is returned on error. """ if not self._npaFeatureAbundance == None: return np.array([list(tplRow)[1:] for tplRow in self._npaFeatureAbundance],'float') return None #Happy Path tested def funcWriteToFile(self, xOutputFile, cDelimiter=None, cFileType=ConstantsBreadCrumbs.c_strPCLFile): """ Writes the AbundanceTable to a file strOutputFile. Will rewrite over a file as needed. Will use the cDelimiter to delimit columns if provided. :param xOutputFile: File stream or File path to write the file to. :type: String File Path :param cDelimiter: Delimiter for the output file. :type: Character If cDlimiter is not specified, the internally stored file delimiter is used. """ if not xOutputFile: return # Check delimiter argument if not cDelimiter: cDelimiter = self._cDelimiter # Check file type: If pcl: Write pcl file; If biom: write biom file; If None - write pcl file if(cFileType == None): cFileType == ConstantsBreadCrumbs.c_strPCLFile if(cFileType == ConstantsBreadCrumbs.c_strPCLFile): # Write as a pcl file self._funcWritePCLFile(xOutputFile, cDelimiter=cDelimiter) elif(cFileType == ConstantsBreadCrumbs.c_strBiomFile): #Write as a biom file self._funcWriteBiomFile(xOutputFile) return def _funcWritePCLFile(self, xOutputFile, cDelimiter=None): """ Write an abundance table object as a PCL file. :param xOutputFile: File stream or File path to write the file to. :type: String File Path :param cDelimiter: Delimiter for the output file. :type: Character If cDlimiter is not specified, the internally stored file delimiter is used. """ f = csv.writer(open( xOutputFile, "w" ) if isinstance(xOutputFile, str) else xOutputFile, csv.excel_tab, delimiter=cDelimiter) # Get Row metadata id info (IDs for column header, keys that line up with the ids) lsRowMetadataIDs, lsRowMetadataIDKeys = self.rwmtRowMetadata.funcMakeIDs() if self.rwmtRowMetadata else [[],[]] #Write Ids f.writerows([[self.funcGetIDMetadataName()]+lsRowMetadataIDs+list(self.funcGetSampleNames())]) #Write column metadata lsKeys = list(set(self._dictTableMetadata.keys())-set([self.funcGetIDMetadataName(),self.funcGetLastMetadataName()])) lMetadataIterations = list(set(lsKeys+[self.funcGetLastMetadataName()] )) f.writerows([[sMetaKey]+([ConstantsBreadCrumbs.c_strEmptyDataMetadata]*len(lsRowMetadataIDs))+self.funcGetMetadata(sMetaKey) for sMetaKey in lMetadataIterations if sMetaKey != self.funcGetIDMetadataName() and not sMetaKey is None]) #Write abundance lsOutput = list() curAbundance = self._npaFeatureAbundance.tolist() for curAbundanceRow in curAbundance: # Make feature metadata, padding with NA as needed lsMetadata = [] for sMetadataId in lsRowMetadataIDKeys: lsMetadata = lsMetadata + self.rwmtRowMetadata.funGetFeatureMetadata( curAbundanceRow[0], sMetadataId ) lsMetadata = lsMetadata + ( [ ConstantsBreadCrumbs.c_strEmptyDataMetadata ] * ( self.rwmtRowMetadata.dictMetadataIDs.get( sMetadataId, 0 ) - len( lsMetadata ) ) ) f.writerows([[curAbundanceRow[0]]+lsMetadata+[str(curAbundanceElement) for curAbundanceElement in curAbundanceRow[1:]]]) return def _funcWriteBiomFile(self, xOutputFile): """ Write an abundance table object as a Biom file. :param xOutputFile: File stream or File path to write the file to. :type: String File Path """ #************************** # Get Sample Names * #************************** lSampNames = list(self.funcGetSampleNames()) #************************** # Metadata Names * #************************** dictMetadataCopy = self.funcGetMetadataCopy() lMetaData = list() iKeysCounter = 0 for lMetadataCopyEntry in dictMetadataCopy.iteritems(): iKeysCounter +=1 sMetadataName = lMetadataCopyEntry[0] lMetadataEntries = lMetadataCopyEntry[1] iMetadataEntryCounter = -1 for sMetadataEntry in lMetadataEntries: iMetadataEntryCounter+=1 dictMetadataNames = dict() dictMetadataNames[sMetadataName ] = sMetadataEntry if iKeysCounter == 1: lMetaData.append(dictMetadataNames) else: lMetaData[iMetadataEntryCounter][sMetadataName ] = sMetadataEntry #************************** # Observation Ids * # and row metadata * #************************** bTaxonomyInRowsFlag = False if self.rwmtRowMetadata.dictRowMetadata is not None: bTaxonomyInRowsFlag = True lObservationMetadataTable = list() lObservationIds = list() lFeatureNamesResultArray = self.funcGetFeatureNames() for sFeatureName in lFeatureNamesResultArray: lObservationIds.append(sFeatureName) if self.rwmtRowMetadata and self.rwmtRowMetadata.dictRowMetadata: RowMetadataEntry = self.rwmtRowMetadata.dictRowMetadata[sFeatureName][ConstantsBreadCrumbs.c_metadata_lowercase] lObservationMetadataTable.append( RowMetadataEntry ) #************************** # Data * #************************** lData = list() lAbundanceCopyResultArray = self.funcGetAbundanceCopy() for r in lAbundanceCopyResultArray: lr = list(r) lr.pop(0) #Remove metadata lAbundanceValues = list() for AbundanceEntry in lr: flAbundanceEntry = float(AbundanceEntry) lAbundanceValues.append(flAbundanceEntry) lData.append(lAbundanceValues) arrData = array(lData) #Convert list to array #************************** # Invoke the * # biom table factory * #************************** if bTaxonomyInRowsFlag == False: BiomTable = table_factory(arrData, lSampNames, lObservationIds, lMetaData, constructor=SparseOTUTable) else: #There was metadata in the rows BiomTable = table_factory(arrData, lSampNames, lObservationIds, lMetaData, lObservationMetadataTable if len(lObservationMetadataTable) > 0 else None, constructor=SparseOTUTable) #************************** # Generate biom Output * #************************** f = open( xOutputFile, "w" ) if isinstance(xOutputFile, str) else xOutputFile f.write(BiomTable.getBiomFormatJsonString(ConstantsBreadCrumbs.c_biom_file_generated_by)) f.close() return #Testing Status: 1 Happy path test @staticmethod def funcPairTables(strFileOne, strFileTwo, strIdentifier, cDelimiter, strOutFileOne, strOutFileTwo, lsIgnoreValues=None): """ This method will read in two files and abridge both files (saved as new files) to just the samples in common between the two files given a common identifier. ***If the identifier is not unique in each data set, the first sample with the pairing id is taken so make sure the ID is unique. Expects the files to have the sample delimiters. :param strFileOne: Path to file one to be paired. :type: String File path. :param strFileTwo: Path to file two to be paired. :type: String File path. :param strIdentifier: Metadata ID that is used for pairing. :type: String Metadata ID. :param cDelimiter: Character delimiter to read the files. :type: Character Delimiter. :param strOutFileOne: The output file for the paired version of the first file. :type: String File path. :param strOutFileTwo: The output file for the paired version of the second file. :type: String File path. :param lsIgnoreValues: These values are ignored even if common IDs between the two files. :type: List List of strings. :return Boolean: Indicator of no errors. False indicates errors. """ #Validate parameters if(not ValidateData.funcIsValidFileName(strFileOne)): sys.stderr.write( "AbundanceTable:checkRawDataFile::Error, file not valid. File:" + strFileOne + "\n" ) return False #Validate parameters if(not ValidateData.funcIsValidFileName(strFileTwo)): sys.stderr.write( "AbundanceTable:checkRawDataFile::Error, file not valid. File:"+ strFileTwo + "\n" ) return False #Make file one #Read in file istm = csv.reader(open(strFileOne,'rU'), csv.excel_tab, delimiter=cDelimiter) lsContentsOne = [lsRow for lsRow in istm] #Get the file identifier for file one fileOneIdentifier = None for sLine in lsContentsOne: if sLine[0] == strIdentifier: fileOneIdentifier = sLine break #Make file two #Read in file istm = csv.reader(open(strFileTwo,'rU'), csv.excel_tab, delimiter=cDelimiter) lsContentsTwo = [lsRow for lsRow in istm] #Get the file identifier for file two fileTwoIdentifier = None for sLine in lsContentsTwo: if sLine[0] == strIdentifier: fileTwoIdentifier = sLine break #Get what is in common between the identifiers #And find which columns to keep in the tables based on the common elements setsCommonIdentifiers = set(fileOneIdentifier) & set(fileTwoIdentifier) if lsIgnoreValues: setsCommonIdentifiers = setsCommonIdentifiers - set(lsIgnoreValues) #Get positions of common identifiers in each data set, if the identifier is not unique in a date set just take the first index lfFileOneIDIndexes = [fileOneIdentifier.index(sCommonID) for sCommonID in setsCommonIdentifiers] lfFileTwoIDIndexes = [fileTwoIdentifier.index(sCommonID) for sCommonID in setsCommonIdentifiers] #Convert index list to list of boolean lfFileOneElements = [iIndex in lfFileOneIDIndexes for iIndex, sIdentifier in enumerate(fileOneIdentifier)] lfFileTwoElements = [iIndex in lfFileTwoIDIndexes for iIndex, sIdentifier in enumerate(fileTwoIdentifier)] #Write out file one ostm = csv.writer(open(strOutFileOne,'w'), csv.excel_tab, delimiter=cDelimiter) (ostm.writerows([np.compress(lfFileOneElements,sLine) for sLine in lsContentsOne])) #Write out file two ostm = csv.writer(open(strOutFileTwo,'w'), csv.excel_tab, delimiter=cDelimiter) (ostm.writerows([np.compress(lfFileTwoElements,sLine) for sLine in lsContentsTwo])) return True #Testing Status: Light happy path testing @staticmethod def funcStratifyAbundanceTableByMetadata(strInputFile = None, strDirectory = "", cDelimiter = ConstantsBreadCrumbs.c_cTab, iStratifyByRow = 1, llsGroupings = []): """ Splits an abundance table into multiple abundance tables stratified by the metadata :param strInputFile: String file path to read in and stratify. :type: String File path. :param strDirectory: Output directory to write stratified files. :type: String Output directory path. :param cDelimiter: The delimiter used in the adundance file. :type: Character Delimiter. :param iStratifyByRow: The row which contains the metadata to use in stratification. :type: Integer Positive integer index. :param llsGroupings: A list of string lists where each string list holds values that are equal and should be grouped together. So for example, if you wanted to group metadata "1", "2", and "3" seperately but "4" and "5" together you would Give the following [["4","5"]]. If you know what "1" and "3" also together you would give [["1","3"],["4","5"]] :type List List of list of strings :return Boolean: Indicator of NO error. False indicates an error. """ #Validate parameters if(not ValidateData.funcIsValidFileName(strInputFile)): sys.stderr.write( "AbundanceTable:stratifyAbundanceTableByMetadata::Error, file not valid. File:" + strInputFile + "\n" ) return False if(not ValidateData.funcIsValidStringType(cDelimiter)): sys.stderr.write( "AbundanceTable:stratifyAbundanceTableByMetadata::Error, Delimiter is not a valid string/char type. Delimiter =" + cDelimiter + "\n" ) return False if(not ValidateData.funcIsValidPositiveInteger(iStratifyByRow, tempZero = True) and (not ValidateData.funcIsValidString(iStratifyByRow))): sys.stderr.write( "AbundanceTable:stratifyAbundanceTableByMetadata::Error, Stratify by row is not a positive integer or string keyword. Row =" + str(iStratifyByRow) + ".\n" ) return False #Get the base of the file path #This is dependent on the given output directory and the prefix of the file name of the input file #If no output file is given then the input file directory is used. baseFilePath = strDirectory lsFilePiecesExt = os.path.splitext(strInputFile) if baseFilePath: baseFilePath = baseFilePath + os.path.splitext(os.path.split(strInputFile)[1])[0] else: baseFilePath = lsFilePiecesExt[0] #Read in file istm = csv.reader(open(strInputFile,'rU'), csv.excel_tab, delimiter=cDelimiter) sFileContents = [lsRow for lsRow in istm] #Collect metadata metadataInformation = dict() #If the tempStratifyRow is by key word than find the index if ValidateData.funcIsValidString(iStratifyByRow): for iLineIndex, strLine in enumerate(sFileContents): if strLine[0].strip("\"") == iStratifyByRow: iStratifyByRow = iLineIndex break #Stratify by metadata row #Split metadata row into metadata entries #And put in a dictionary containing {"variable":[1,2,3,4 column index]} stratifyByRow = sFileContents[iStratifyByRow] for metaDataIndex in xrange(1,len(stratifyByRow)): metadata = stratifyByRow[metaDataIndex] #Put all wierd categories, none, whitespace, blank space metadata cases into one bin if not metadata or metadata in string.whitespace: metadata = "Blank" #Remove any extraneous formatting metadata = metadata.strip(string.whitespace) #Store processed metadata with column occurence in dictionary if(not metadata in metadataInformation): metadataInformation[metadata] = [] metadataInformation[metadata].append(metaDataIndex) #For each of the groupings #Use the first value as the primary value which the rest of the values in the list are placed into #Go through the dict holding the indices and extend the list for the primary value with the secondary values #Then set the secondary value list to empty so that it will be ignored. if llsGroupings: for lSKeyGroups in llsGroupings: if len(lSKeyGroups) > 1: for sGroup in lSKeyGroups[1:]: if sGroup in metadataInformation: metadataInformation[lSKeyGroups[0]].extend(metadataInformation[sGroup]) metadataInformation[sGroup] = [] #Stratify data stratifiedAbundanceTables = dict() for tableRow in sFileContents: if(len(tableRow)> 1): for metadata in metadataInformation: #[0] includes the taxa line columns = metadataInformation[metadata] if columns: columns = [0] + columns lineList = list() for column in columns: lineList.append(tableRow[column]) stratifiedAbundanceTables.setdefault(metadata,[]).append(lineList) #Write to file lsFilesWritten = [] for metadata in stratifiedAbundanceTables: sOutputFile = baseFilePath+"-by-"+metadata.strip("\"")+lsFilePiecesExt[1] f = csv.writer(open(sOutputFile,'w'), csv.excel_tab, delimiter = cDelimiter ) f.writerows(stratifiedAbundanceTables[metadata]) lsFilesWritten.append(sOutputFile) return lsFilesWritten #******************************************* #* biom interface functions: * #* 1. _funcBiomToStructuredArray * #* 2. _funcDecodeBiomMetadata * #******************************************* @staticmethod def _funcBiomToStructuredArray(xInputFile = None): """ Reads the biom input file and builds a "BiomCommonArea" that contains: 1.BiomCommonArea['sLastMetadata'] - This is the name of the last Metadata (String) 2.BiomCommonArea['BiomTaxData']- dict() - going to be used as lcontents[0]==TaxData 3.BiomCommonArea['Metadata'] - dict() - going to be used as lcontents[1]==MetaData 4.BiomCommonArea['BiomFileInfo'] - dict() - going to be used as lcontents[2]==FileInfo (id, format:eg. Biological Observation Matrix 0.9.1) etc. 5.BiomCommonArea['column_metadata_id'] - This is a string which is the name of the column id :param xInputFile: File path of biom file to read. :type: String File path. :return: BiomCommonArea (See description above) :type: dict() """ #******************************************* #* Build the metadata * #******************************************* try: BiomTable = parse_biom_table(open(xInputFile,'U') if isinstance(xInputFile, str) else xInputFile) #Import the biom file except: print("Failure decoding biom file - please check your input biom file and rerun") BiomCommonArea = None return BiomCommonArea BiomCommonArea = dict() dBugNames = list() #Bug Names Table dRowsMetadata = None #Initialize the np.array of the Rows metadata BiomElements = BiomTable.getBiomFormatObject('') for BiomKey, BiomValue in BiomElements.iteritems(): #**************************************************** #* Checking the different keys: format, * #* rows, columns, date, generated_by * #**************************************************** if (BiomKey == ConstantsBreadCrumbs.c_strFormatKey or BiomKey == ConstantsBreadCrumbs.c_strFormatUrl or BiomKey == ConstantsBreadCrumbs.c_MatrixTtype or BiomKey == ConstantsBreadCrumbs.c_strTypekey or BiomKey == ConstantsBreadCrumbs.c_strIDKey #Same as below or BiomKey == ConstantsBreadCrumbs.c_GeneratedBy #<---Need to follow up with Biom as always BiomValue = "" even though in the file has a value or BiomKey == ConstantsBreadCrumbs.c_strDateKey): #Same as above BiomCommonArea = AbundanceTable._funcInsertKeyToCommonArea(BiomCommonArea, BiomKey, BiomValue) if BiomKey == ConstantsBreadCrumbs.c_rows: iMaxIdLen = 0 for iIndexRowMetaData in range(0, len(BiomValue)): if ConstantsBreadCrumbs.c_id_lowercase in BiomValue[iIndexRowMetaData]: sBugName = BiomValue[iIndexRowMetaData][ConstantsBreadCrumbs.c_id_lowercase] dBugNames.append(sBugName) #Post to the bug table if len(sBugName) > iMaxIdLen: #We are calculating dynamically the length of the ID iMaxIdLen = len(sBugName) if ConstantsBreadCrumbs.c_metadata_lowercase in BiomValue[0] and BiomValue[0][ConstantsBreadCrumbs.c_metadata_lowercase] != None : dRowsMetadata = AbundanceTable._funcBiomBuildRowMetadata(BiomValue, iMaxIdLen ) if BiomKey == ConstantsBreadCrumbs.c_columns: BiomCommonArea = AbundanceTable._funcDecodeBiomMetadata(BiomCommonArea, BiomValue, iMaxIdLen) #Call the subroutine to Build the metadata #******************************************* #* Build the TaxData * #******************************************* BiomTaxDataWork = list() #Initlialize TaxData BiomObservations = BiomTable.iterObservations(conv_to_np=True) #Invoke biom method to fetch data from the biom file for BiomObservationData in BiomObservations: sBugName = str( BiomObservationData[1]) BiomTaxDataEntry = list() BiomTaxDataEntry.append(sBugName) BiomObservationsValues = BiomObservationData[0] for BiomDataValue in BiomObservationsValues: BiomTaxDataEntry.append(BiomDataValue) BiomTaxDataWork.append(tuple(BiomTaxDataEntry)) BiomCommonArea[ConstantsBreadCrumbs.c_BiomTaxData] = np.array(BiomTaxDataWork,dtype=np.dtype(BiomCommonArea[ConstantsBreadCrumbs.c_Dtype])) BiomCommonArea[ConstantsBreadCrumbs.c_dRowsMetadata] = RowMetadata(dRowsMetadata) del(BiomCommonArea[ConstantsBreadCrumbs.c_Dtype]) #Not needed anymore return BiomCommonArea @staticmethod def _funcDecodeBiomMetadata(BiomCommonArea, BiomValue = None, iMaxIdLen=0 ): """ Decode the Biom Metadata and build: 1. BiomCommonArea['Metadata'] 2. BiomCommonArea['Dtype'] 3. BiomCommonArea['sLastMetadata'] 4. BiomCommonArea['column_metadata_id'] - This is a string which is the name of the column id These elements will be formatted and passed down the line to build the AbundanceTable :param BiomValue: The "columns" Metadata from the biom file (Contains the Metadata information) :type: dict() :param iMaxIdLen: The maximum length of a row ID :type: Integer :return: BiomCommonArea :type: dict() """ BiomCommonArea[ConstantsBreadCrumbs.c_sLastMetadata] = None #Initialize the LastMetadata element BiomCommonArea['dRowsMetadata'] = None #Initialize for cases that there is no metadata in the rows strLastMetadata = None strIDMetadata = None lenBiomValue = len(BiomValue) BiomMetadata = dict() for cntMetadata in range(0, lenBiomValue): BiomMetadataEntry = BiomValue[cntMetadata] for key, value in BiomMetadataEntry.iteritems(): #Loop on the entries if key == ConstantsBreadCrumbs.c_id_lowercase: #If id - process it strIDMetadata = ConstantsBreadCrumbs.c_ID if ConstantsBreadCrumbs.c_ID not in BiomMetadata: #If ID not in the common area - initalize it BiomMetadata[ConstantsBreadCrumbs.c_ID] = list() #Initialize a list for indx in range(0, lenBiomValue): #And post the values BiomMetadata[ConstantsBreadCrumbs.c_ID].append(None) BiomMetadata[ConstantsBreadCrumbs.c_ID][cntMetadata] = value.encode(ConstantsBreadCrumbs.c_ascii,ConstantsBreadCrumbs.c_ignore) if key == ConstantsBreadCrumbs.c_metadata_lowercase: #If key = metadata if not value is None: #And value is not empty MetadataDict = value #Initialize a dictionary and post the values for MDkey, MDvalue in MetadataDict.iteritems(): if type(MDkey) == unicode : MDkeyAscii = MDkey.encode(ConstantsBreadCrumbs.c_ascii,ConstantsBreadCrumbs.c_ignore) else: MDkeyAscii = MDkey if type(MDvalue) == unicode: MDvalueAscii = MDvalue.encode(ConstantsBreadCrumbs.c_ascii,ConstantsBreadCrumbs.c_ignore) else: MDvalueAscii = MDvalue if len(MDkeyAscii) > 0: #Search for the last metadata if not strIDMetadata: strIDMetadata = MDkeyAscii BiomCommonArea[ConstantsBreadCrumbs.c_sLastMetadata] = MDkeyAscii #Set the last Metadata if MDkeyAscii not in BiomMetadata: BiomMetadata[MDkeyAscii] = list() for indx in range(0, lenBiomValue): BiomMetadata[MDkeyAscii].append(None) BiomMetadata[MDkeyAscii][cntMetadata] = MDvalueAscii BiomCommonArea[ConstantsBreadCrumbs.c_Metadata] = BiomMetadata BiomCommonArea[ConstantsBreadCrumbs.c_MetadataID] = strIDMetadata #********************************************** #* Build dtype * #********************************************** BiomDtype = list() iMaxIdLen+=10 #Increase it by 10 BiomDtypeEntry = list() FirstValue = ConstantsBreadCrumbs.c_ID SecondValue = "a" + str(iMaxIdLen) BiomDtypeEntry.append(FirstValue) BiomDtypeEntry.append(SecondValue) BiomDtype.append(tuple(BiomDtypeEntry)) for a in BiomMetadata[ConstantsBreadCrumbs.c_ID]: BiomDtypeEntry = list() FirstValue = a.encode(ConstantsBreadCrumbs.c_ascii,ConstantsBreadCrumbs.c_ignore) SecondValue = ConstantsBreadCrumbs.c_f4 BiomDtypeEntry.append(FirstValue) BiomDtypeEntry.append(SecondValue) BiomDtype.append(tuple(BiomDtypeEntry)) BiomCommonArea[ConstantsBreadCrumbs.c_Dtype] = BiomDtype return BiomCommonArea @staticmethod def _funcBiomBuildRowMetadata( BiomValue, iMaxIdLen ): """ Builds the row metadata from a BIOM value :param BiomValue: BIOM Value from the BIOM JSON parsing :type: Complex dict of string pairs and dicts :param iMaxIdLen: Maximum length of all the IDs :type: int :return: dictRowsMetadata - np Array containing the rows metadata :type: {string feature id: {'metadata': {'taxonomy': [list of metadata values]}}} """ # Build the input dict for RowMetadata from a dict of dicts from a BIOM file dictRowsMetadata = dict() for iIndexRowMetaData in range(0, len(BiomValue)): dictRowsMetadata[str(BiomValue[iIndexRowMetaData][ConstantsBreadCrumbs.c_id_lowercase])] = dict() RowMetadataEntryFromTable = BiomValue[iIndexRowMetaData][ConstantsBreadCrumbs.c_metadata_lowercase] dMetadataTempDict = dict() for key, value in RowMetadataEntryFromTable.iteritems(): dMetadataTempDict[key] = value dictRowsMetadata[str(BiomValue[iIndexRowMetaData][ConstantsBreadCrumbs.c_id_lowercase])][ConstantsBreadCrumbs.c_metadata_lowercase] = dMetadataTempDict return dictRowsMetadata @staticmethod def _funcInsertKeyToCommonArea(BiomCommonArea, BiomKey, BiomValue): """ Inserts the keys into the BiomCommonArea["BiomFileInfo"] :param BiomCommonArea - The common area that has been built before :type: dict() :param BiomKey - The current key (eg. format, date, generated by) :type: str :param BiomValue - The current value of the key (eg. for format: "Biological Observation Matrix 0.9.1") :type: str :return: BiomCommonArea - The updated common area :type: dict() """ if ConstantsBreadCrumbs.c_BiomFileInfo not in BiomCommonArea: BiomCommonArea[ConstantsBreadCrumbs.c_BiomFileInfo] = dict() strInsertKey = BiomKey #Set Default - But it is now always the same... (eg. URL is not: format_url -->url and others) PostBiomValue = BiomValue #The default value to be posted if BiomKey == ConstantsBreadCrumbs.c_strFormatUrl: strInsertKey = ConstantsBreadCrumbs.c_strURLKey if BiomKey == ConstantsBreadCrumbs.c_MatrixTtype: strInsertKey = ConstantsBreadCrumbs.c_strSparsityKey if BiomKey == ConstantsBreadCrumbs.c_GeneratedBy: PostBiomValue = None if BiomKey == ConstantsBreadCrumbs.c_strDateKey: PostBiomValue = None BiomCommonArea[ConstantsBreadCrumbs.c_BiomFileInfo][strInsertKey] = PostBiomValue return BiomCommonArea