micropita: src/breadcrumbs/src/Metric.py comparison

comparison src/breadcrumbs/src/Metric.py @ 0:2f4f6f08c8c4 draft

Uploaded

author	george-weingart
date	Tue, 13 May 2014 21:58:57 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:2f4f6f08c8c4
+"""
+Author: Timothy Tickle
+Description: Calculates Metrics.
+"""
+#####################################################################################
+#Copyright (C) <2012>
+#
+#Permission is hereby granted, free of charge, to any person obtaining a copy of
+#this software and associated documentation files (the "Software"), to deal in the
+#Software without restriction, including without limitation the rights to use, copy,
+#modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
+#and to permit persons to whom the Software is furnished to do so, subject to
+#the following conditions:
+#
+#The above copyright notice and this permission notice shall be included in all copies
+#or substantial portions of the Software.
+#
+#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+#INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+#PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+#HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+#OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+#SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#####################################################################################
+__author__ = "Timothy Tickle"
+__copyright__ = "Copyright 2012"
+__credits__ = ["Timothy Tickle"]
+__license__ = "MIT"
+__maintainer__ = "Timothy Tickle"
+__email__ = "ttickle@sph.harvard.edu"
+__status__ = "Development"
+#Update path
+from ConstantsBreadCrumbs import ConstantsBreadCrumbs
+import csv
+import numpy as np
+from types import *
+from ValidateData import ValidateData
+#External libraries
+from cogent.maths.unifrac.fast_unifrac import fast_unifrac_file
+import cogent.maths.stats.alpha_diversity
+import scipy.spatial.distance
+class Metric:
+"""
+Performs ecological measurements.
+"""
+#Diversity metrics Alpha
+c_strSimpsonDiversity = "SimpsonD"
+c_strInvSimpsonDiversity = "InSimpsonD"
+c_strChao1Diversity = "Chao1"
+#Diversity metrics Beta
+c_strBrayCurtisDissimilarity = "B_Curtis"
+c_strUnifracUnweighted = "unifrac_unweighted"
+c_strUnifracWeighted = "unifrac_weighted"
+#Additive inverses of beta metrics
+c_strInvBrayCurtisDissimilarity = "InB_Curtis"
+#Richness
+c_strShannonRichness = "ShannonR"
+c_strObservedCount = "Observed_Count"
+#Different alpha diversity metrics
+setAlphaDiversities = set(["observed_species","margalef","menhinick",
+	"dominance","reciprocal_simpson","shannon","equitability","berger_parker_d",
+	"mcintosh_d","brillouin_d","strong","fisher_alpha","simpson",
+	"mcintosh_e","heip_e","simpson_e","robbins","michaelis_menten_fit","chao1","ACE"])
+#Different beta diversity metrics
+setBetaDiversities = set(["braycurtis","canberra","chebyshev","cityblock",
+	"correlation","cosine","euclidean","hamming","sqeuclidean"])
+#Tested 4
+@staticmethod
+def funcGetSimpsonsDiversityIndex(ldSampleTaxaAbundancies=None):
+"""
+Calculates the Simpsons diversity index as defined as sum(Pi*Pi).
+Note***: Assumes that the abundance measurements are already normalized by the total population N.
+:param	ldSampleTaxaAbundancies:	List of measurements to calculate metric on (a sample).
+:type:	List of doubles
+:return	Double:	Diversity metric
+"""
+#Calculate metric
+return sum((ldSampleTaxaAbundancies)*(ldSampleTaxaAbundancies))
+#Tested 4
+@staticmethod
+def funcGetInverseSimpsonsDiversityIndex(ldSampleTaxaAbundancies=None):
+"""
+Calculates Inverse Simpsons diversity index 1/sum(Pi*Pi).
+This is multiplicative inverse which reverses the order of the simpsons diversity index.
+Note***: Assumes that the abundance measurements are already normalized by the total population N.
+:param	ldSampleTaxaAbundancies:	List of measurements to calculate metric on (a sample).
+:type:	List of doubles
+:return	Double:	Diversity metric
+"""
+simpsons = Metric.funcGetSimpsonsDiversityIndex(ldSampleTaxaAbundancies)
+#If simpsons is false return false, else return inverse
+if not simpsons:
+return False
+return 1.0/simpsons
+#Tested 4
+@staticmethod
+def funcGetShannonRichnessIndex(ldSampleTaxaAbundancies=None):
+"""
+Calculates the Shannon richness index.
+Note***: Assumes that the abundance measurements are already normalized by the total population N.
+If not normalized, include N in the parameter tempTotalN and it will be.
+	This is in base exp(1) like the default R Vegan package. Cogent is by defaul in bits (base=2)
+	Both options are here for your use. See Metric.funcGetAlphaDiversity() to access cogent
+:param	ldSampleTaxaAbundancies:	List of measurements to calculate metric on (a sample).
+:type:	List of doubles
+:return	Double:	Richness metric
+"""
+#Calculate metric
+ldSampleTaxaAbundancies = ldSampleTaxaAbundancies[np.where(ldSampleTaxaAbundancies != 0)]
+tempIntermediateNumber = sum(ldSampleTaxaAbundancies*(np.log(ldSampleTaxaAbundancies)))
+if(tempIntermediateNumber == 0.0):
+return 0.0
+return -1 * tempIntermediateNumber
+#Test 3
+@staticmethod
+def funcGetChao1DiversityIndex(ldSampleTaxaAbundancies=None, fCorrectForBias=False):
+"""
+Calculates the Chao1 diversity index.
+Note***: Not normalized by abundance.
+:param	ldSampleTaxaAbundancies:	List of measurements to calculate metric on (a sample).
+:type:	List of doubles
+:param	fCorrectForBias:	Indicator to use bias correction.
+:type:	Boolean	False indicates uncorrected for bias (uncorrected = Chao 1984, corrected = Chao 1987, Eq. 2)
+:return	Double:	Diversity metric
+"""
+#If not counts return false
+if [num for num in ldSampleTaxaAbundancies if((num<1) and (not num==0))]: return False
+#Observed = total number of species observed in all samples pooled
+totalObservedSpecies = len(ldSampleTaxaAbundancies)-len(ldSampleTaxaAbundancies[ldSampleTaxaAbundancies == 0])
+#Singles = number of species that occur in exactly 1 sample
+singlesObserved = len(ldSampleTaxaAbundancies[ldSampleTaxaAbundancies == 1.0])
+#Doubles = number of species that occue in exactly 2 samples
+doublesObserved = len(ldSampleTaxaAbundancies[ldSampleTaxaAbundancies == 2.0])
+#If singles or doubles = 0, return observations so that a divided by zero error does not occur
+if((singlesObserved == 0) or (doublesObserved == 0)):
+return totalObservedSpecies
+#Calculate metric
+if fCorrectForBias:
+return cogent.maths.stats.alpha_diversity.chao1_bias_corrected(observed = totalObservedSpecies, singles = singlesObserved, doubles = doublesObserved)
+else:
+return cogent.maths.stats.alpha_diversity.chao1_uncorrected(observed = totalObservedSpecies, singles = singlesObserved, doubles = doublesObserved)
+#Test 3
+@staticmethod
+def funcGetObservedCount(ldSampleAbundances, dThreshold = 0.0):
+"""
+Count how many bugs / features have a value of greater than 0 or the threshold given.
+Expects a vector of abundances.
+****Do not normalize data if using the threshold.
+:param	ldSampleAbundances:	List of measurements to calculate metric on (a sample).
+:type:	List of doubles
+:param	dThreshold:	The lowest number the measurement can be to be counted as an observation.
+:type:	Double
+:return	Count:	Number of features observed in a sample.
+"""
+return sum([1 for observation in ldSampleAbundances if observation > dThreshold])
+#Test Cases 6
+@staticmethod
+def funcGetAlphaDiversity(liCounts,strMetric):
+"""
+Passes counts to cogent for an alpha diversity metric.
+	setAlphaDiversities are the names supported
+:param	liCount:	List of counts to calculate metric on (a sample).
+:type:	List of ints
+:return	Diversity:	Double diversity metric.
+"""
+return getattr(cogent.maths.stats.alpha_diversity,strMetric)(liCounts)
+#Happy path tested 1
+@staticmethod
+def funcGetDissimilarity(ldSampleTaxaAbundancies, funcDistanceFunction):
+"""
+Calculates the distance between samples given a function.
+If you have 5 rows (labeled r1,r2,r3,r4,r5) the vector are the distances in this order.
+condensed form = [d(r1,r2), d(r1,r3), d(r1,r4), d(r1,r5), d(r2,r3), d(r2,r4), d(r2,r5), d(r3,r4), d(r3,r5), d(r4,r5)].
+Note***: Assumes that the abundance measurements are already normalized by the total population N.
+:param	ldSampleTaxaAbundancies:
+:type:	List of doubles
+:param	funcDistanceFunction: Distance function used to calculate distances
+:type:	Function
+:return	Double:	Dissimilarity metric
+"""
+#Calculate metric
+try:
+return scipy.spatial.distance.pdist(ldSampleTaxaAbundancies, funcDistanceFunction)
+except ValueError as error:
+print "".join(["Metric.funcGetDissimilarity. Error=",str(error)])
+return False
+#Test case 1
+@staticmethod
+def funcGetDissimilarityByName(ldSampleTaxaAbundancies, strMetric):
+"""
+Calculates beta-diversity metrics between lists of abundances
+	setBetaDiversities are the names supported
+:param	ldSampleTaxaAbundancies:
+:type:	List of doubles
+:param	strMetric: Name of the distance function used to calculate distances
+:type:	String
+:return	list double:	Dissimilarity metrics between each sample
+"""
+return scipy.spatial.distance.pdist(ldSampleTaxaAbundancies,strMetric)
+#Test 3
+@staticmethod
+def funcGetBrayCurtisDissimilarity(ldSampleTaxaAbundancies):
+"""
+Calculates the BrayCurtis Beta dissimilarity index.
+d(u,v)=sum(abs(row1-row2))/sum(row1+row2).
+This is scale invariant.
+If you have 5 rows (labeled r1,r2,r3,r4,r5) the vector are the distances in this order.
+condensed form = [d(r1,r2), d(r1,r3), d(r1,r4), d(r1,r5), d(r2,r3), d(r2,r4), d(r2,r5), d(r3,r4), d(r3,r5), d(r4,r5)].
+Note***: Assumes that the abundance measurements are already normalized by the total population N.
+:param	ldSampleTaxaAbundancies:
+:type:	List of doubles
+:return	Double Matrix:	Dissimilarity metric
+"""
+#Calculate metric
+try:
+return scipy.spatial.distance.pdist(X=ldSampleTaxaAbundancies, metric='braycurtis')
+except ValueError as error:
+print "".join(["Metric.getBrayCurtisDissimilarity. Error=",str(error)])
+return False
+#Test 3
+@staticmethod
+def funcGetInverseBrayCurtisDissimilarity(ldSampleTaxaAbundancies):
+"""
+Calculates 1 - the BrayCurtis Beta dissimilarity index.
+d(u,v)=1-(sum(abs(row1-row2))/sum(row1+row2)).
+This is scale invariant and ranges between 0 and 1.
+If you have 5 rows (labeled r1,r2,r3,r4,r5) the vector are the distances in this order.
+condensed form = [d(r1,r2), d(r1,r3), d(r1,r4), d(r1,r5), d(r2,r3), d(r2,r4), d(r2,r5), d(r3,r4), d(r3,r5), d(r4,r5)].
+Note***: Assumes that the abundance measurements are already normalized by the total population N.
+:param	ldSampleTaxaAbundancies:	An np.array of samples (rows) x measurements (columns) in which distance is measured between rows
+:type:	List	List of doubles
+:return	Double Matrix:	1 - Bray-Curtis dissimilarity.
+"""
+bcValue = Metric.funcGetBrayCurtisDissimilarity(ldSampleTaxaAbundancies = ldSampleTaxaAbundancies)
+if not type(bcValue) is BooleanType:
+return 1.0-bcValue
+return False
+#Test cases 8
+@staticmethod
+def funcGetUnifracDistance(istrmTree,istrmEnvr,lsSampleOrder=None,fWeighted=True):
+	"""
+	Gets a unifrac distance from files/filestreams.
+:param	istrmTree:	File path or stream which is a Newick format file
+:type:	String of file stream
+:param	istrmEnvr:	File path or stream which is a Newick format file
+:type:	String of file stream
+	"""
+	npaDist, lsSampleNames = fast_unifrac_file(open(istrmTree,"r") if isinstance(istrmTree, str) else istrmTree,
+			open(istrmEnvr,"r") if isinstance(istrmEnvr, str) else istrmEnvr, weighted=fWeighted).get("distance_matrix",False)
+#Was trying to avoid preallocating a matrix but if you only need a subset of the samples then it
+#is simpler to preallocate so this is what I am doing but making a condensed matrix and not a full matrix
+#Dictionary to translate the current order of the samples to what is expected if given an input order
+if lsSampleOrder:
+#{NewOrder:OriginalOrder} way to convert from old to new sample location
+dictTranslate = dict([[lsSampleOrder.index(sSampleName),lsSampleNames.index(sSampleName)] for sSampleName in lsSampleNames if sSampleName in lsSampleOrder])
+#Check to make sure all samples requested were found
+if not len(dictTranslate.keys()) == len(lsSampleOrder):
+print "Metric.funcGetUnifracDistance. Error= The some or all sample names given (lsSampleOrder) were not contained in the matrix."
+return False
+#Length of data
+iLengthOfData = len(lsSampleOrder)
+#Preallocate matrix and shuffle
+mtrxData = np.zeros(shape=(iLengthOfData,iLengthOfData))
+for x in xrange(iLengthOfData):
+for y in xrange(iLengthOfData):
+mtrxData[x,y] = npaDist[dictTranslate[x],dictTranslate[y]]
+npaDist = mtrxData
+lsSampleNames = lsSampleOrder
+#If no sample order is given, condense the matrix and return
+return (scipy.spatial.distance.squareform(npaDist),lsSampleNames)
+#Test 7
+@staticmethod
+def funcGetAlphaMetric(ldAbundancies, strMetric):
+"""
+Get alpha abundance of the metric for the vector.
+Note: Shannon is measured with base 2 ("shannon") or base exp(1) (Metric.c_strShannonRichness) depending which method is called.
+:param	ldAbundancies:	List of values to compute metric (a sample).
+:type:	List	List of doubles.
+:param	strMetric:	The metric to measure.
+:type:	String	Metric name (Use from constants above).
+:return	Double:	Metric specified by strMetric derived from ldAbundancies.
+"""
+if(strMetric == Metric.c_strShannonRichness):
+return Metric.funcGetShannonRichnessIndex(ldSampleTaxaAbundancies=ldAbundancies)
+elif(strMetric == Metric.c_strSimpsonDiversity):
+return Metric.funcGetSimpsonsDiversityIndex(ldSampleTaxaAbundancies=ldAbundancies)
+elif(strMetric == Metric.c_strInvSimpsonDiversity):
+return Metric.funcGetInverseSimpsonsDiversityIndex(ldSampleTaxaAbundancies=ldAbundancies)
+elif(strMetric == Metric.c_strObservedCount):
+return Metric.funcGetObservedCount(ldSampleAbundances=ldAbundancies)
+#Chao1 Needs NOT Normalized Abundance (Counts)
+elif(strMetric == Metric.c_strChao1Diversity):
+return Metric.funcGetChao1DiversityIndex(ldSampleTaxaAbundancies=ldAbundancies)
+elif(strMetric in Metric.setAlphaDiversities):
+return Metric.funcGetAlphaDiversity(liCounts=ldAbundancies, strMetric=strMetric)
+else:
+return False
+#Test 5
+@staticmethod
+def funcBuildAlphaMetricsMatrix(npaSampleAbundance = None, lsSampleNames = None, lsDiversityMetricAlpha = None):
+"""
+Build a matrix of alpha diversity metrics for each sample
+Row = metric, column = sample
+:param	npaSampleAbundance:	Observations (Taxa (row) x sample (column))
+:type:	Numpy Array
+:param	lsSampleNames:	List of sample names of samples to measure (do not include the taxa id column name or other column names which should not be read).
+:type:	List of strings	Strings being samples to measure from the npaSampleAbundance.
+:param	lsDiversityMetricAlpha:	List of diversity metrics to use in measuring.
+:type:	List of strings	Strings being metrics to derived from the indicated samples.
+:return	List of List of doubles:	Each internal list is a list of (floats) indicating a specific metric measurement method measuring multiple samples
+[[metric1-sample1, metric1-sample2, metric1-sample3],[metric1-sample1, metric1-sample2, metric1-sample3]]
+"""
+if not ValidateData.funcIsValidList(lsDiversityMetricAlpha):
+lsDiversityMetricAlpha = [lsDiversityMetricAlpha]
+#Get amount of metrics
+metricsCount = len(lsDiversityMetricAlpha)
+#Create return
+returnMetricsMatrixRet = [[] for index in lsDiversityMetricAlpha]
+#For each sample get all metrics
+#Place in list of lists
+#[[metric1-sample1, metric1-sample2, metric1-sample3],[metric1-sample1, metric1-sample2, metric1-sample3]]
+for sample in lsSampleNames:
+sampleAbundance = npaSampleAbundance[sample]
+for metricIndex in xrange(0,metricsCount):
+returnMetricsMatrixRet[metricIndex].append(Metric.funcGetAlphaMetric(ldAbundancies = sampleAbundance, strMetric = lsDiversityMetricAlpha[metricIndex]))
+return returnMetricsMatrixRet
+#Testing 6 cases
+@staticmethod
+def funcGetBetaMetric(npadAbundancies=None, sMetric=None, istrmTree=None, istrmEnvr=None, lsSampleOrder=None, fAdditiveInverse = False):
+"""
+Takes a matrix of values and returns a beta metric matrix. The metric returned is indicated by name (sMetric).
+:param	npadAbundancies:	Numpy array of sample abundances to measure against.
+:type:	Numpy Array	Numpy array where row=samples and columns = features.
+:param	sMetric:	String name of beta metric. Possibilities are listed in microPITA.
+:type:	String	String name of beta metric. Possibilities are listed in microPITA.
+:return	Double:	Measurement indicated by metric for given abundance list
+"""
+if sMetric == Metric.c_strBrayCurtisDissimilarity:
+mtrxDistance = Metric.funcGetBrayCurtisDissimilarity(ldSampleTaxaAbundancies=npadAbundancies)
+elif sMetric == Metric.c_strInvBrayCurtisDissimilarity:
+mtrxDistance = Metric.funcGetInverseBrayCurtisDissimilarity(ldSampleTaxaAbundancies=npadAbundancies)
+elif sMetric in Metric.setBetaDiversities:
+mtrxDistance = Metric.funcGetDissimilarityByName(ldSampleTaxaAbundancies=npadAbundancies, strMetric=sMetric)
+elif sMetric == Metric.c_strUnifracUnweighted:
+mtrxDistance = Metric.funcGetUnifracDistance(istrmTree=istrmTree,istrmEnvr=istrmEnvr,lsSampleOrder=lsSampleOrder,fWeighted=False)
+#            mtrxDistance = xReturn[0] if not type(xReturn) is BooleanType else xReturn
+elif sMetric == Metric.c_strUnifracWeighted:
+mtrxDistance = Metric.funcGetUnifracDistance(istrmTree=istrmTree,istrmEnvr=istrmEnvr,lsSampleOrder=lsSampleOrder,fWeighted=True)
+#            mtrxDistance = xReturn[0] if not type(xReturn) is BooleanType else xReturn
+else:
+mtrxDistance = False
+if fAdditiveInverse and not type(mtrxDistance) is BooleanType:
+	    if sMetric in [Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted]:
+		mtrxDistance = (1.0 - mtrxDistance[0],mtrxDistance[1])
+	    else:
+mtrxDistance = 1.0 - mtrxDistance
+	return mtrxDistance
+#Test Cases 11
+@staticmethod
+def funcReadMatrixFile(istmMatrixFile, lsSampleOrder=None):
+	"""
+	Reads in a file with a precalculated beta-diversty matrix.
+	:param istmMatrixFile:	File with beta-diversity matrix
+	:type:	FileStream of String file path
+	"""
+#Read in data
+f = csv.reader(open(istmMatrixFile,"r") if isinstance(istmMatrixFile, str) else istmMatrixFile, delimiter=ConstantsBreadCrumbs.c_matrixFileDelim )
+#Get header
+try:
+lsHeader = f.next()
+except StopIteration:
+return (False,False)
+lsHeaderReducedToSamples = [sHeader for sHeader in lsHeader if sHeader in lsSampleOrder] if lsSampleOrder else lsHeader[1:]
+#If no sample ordering is given, set the ordering to what is in the file
+if not lsSampleOrder:
+	    lsSampleOrder = lsHeaderReducedToSamples
+#Preallocate matrix
+mtrxData = np.zeros(shape=(len(lsSampleOrder),len(lsSampleOrder)))
+#Make sure all samples requested are in the file
+if(not len(lsSampleOrder) == len(lsHeaderReducedToSamples)): return False
+	for lsLine in f:
+if lsLine[0] in lsSampleOrder:
+iRowIndex = lsSampleOrder.index(lsLine[0])
+for i in xrange(1,len(lsSampleOrder)):
+iColumnIndexComing = lsHeader.index(lsSampleOrder[i])
+iColumnIndexGoing = lsSampleOrder.index(lsSampleOrder[i])
+mtrxData[iRowIndex,iColumnIndexGoing] = lsLine[iColumnIndexComing]
+mtrxData[iColumnIndexGoing,iRowIndex] = lsLine[iColumnIndexComing]
+tpleMData = mtrxData.shape
+mtrxData = mtrxData if any(sum(ld)>0 for ld in mtrxData) or ((tpleMData[0]==1) and (tpleMData[1]==1)) else []
+return (mtrxData,lsSampleOrder)
+#Test cases 2
+@staticmethod
+def funcWriteMatrixFile(mtrxMatrix, ostmMatrixFile, lsSampleNames=None):
+"""
+Writes a square matrix to file.
+:param mtrxMatrix:	Matrix to write to file
+:type:	Numpy array
+:lsSampleNames:	The names of the samples in the order of the matrix
+:type:	List of strings
+:ostmBetaMatrixFile:	File to write to
+:type:	String or file stream
+"""
+if not sum(mtrxMatrix.shape)>0 or not ostmMatrixFile:
+return False
+#Check to make sure the sample names are the correct length
+tpleiShape = mtrxMatrix.shape
+if not lsSampleNames:
+lsSampleNames = range(tpleiShape[0])
+if not(len(lsSampleNames) == tpleiShape[0]):
+print "".join(["Metric.funcWriteMatrixFile. Error= Length of sample names ("+str(len(lsSampleNames))+") and matrix ("+str(mtrxMatrix.shape)+") not equal."])
+return False
+#Write to file
+ostmOut = csv.writer(open(ostmMatrixFile,"w") if isinstance(ostmMatrixFile,str) else ostmMatrixFile, delimiter=ConstantsBreadCrumbs.c_matrixFileDelim )
+#Add the additional space at the beginning of the sample names to represent the id row/column
+lsSampleNames = [""]+list(lsSampleNames)
+#Write header and each row to file
+ostmOut.writerow(lsSampleNames)
+[ostmOut.writerow([lsSampleNames[iIndex+1]]+mtrxMatrix[iIndex,].tolist()) for iIndex in xrange(tpleiShape[0])]
+return True

Mercurial > repos > george-weingart > micropita

comparison src/breadcrumbs/src/Metric.py @ 0:2f4f6f08c8c4 draft