view MicroPITA.py @ 3:b4cf8c75305b draft default tip

Pointing to the right root directory
author george-weingart
date Tue, 30 Aug 2016 13:10:34 -0400
parents cd71e90abfab
children
line wrap: on
line source

#!/usr/bin/env python
"""
Author: Timothy Tickle
Description: Class to Run analysis for the microPITA paper
"""

#####################################################################################
#Copyright (C) <2012>
#
#Permission is hereby granted, free of charge, to any person obtaining a copy of
#this software and associated documentation files (the "Software"), to deal in the
#Software without restriction, including without limitation the rights to use, copy,
#modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
#and to permit persons to whom the Software is furnished to do so, subject to
#the following conditions:
#
#The above copyright notice and this permission notice shall be included in all copies
#or substantial portions of the Software.
#
#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
#INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
#PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
#HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
#OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
#SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#####################################################################################

__author__ = "Timothy Tickle"
__copyright__ = "Copyright 2012"
__credits__ = ["Timothy Tickle"]
__license__ = "MIT"
__maintainer__ = "Timothy Tickle"
__email__ = "ttickle@sph.harvard.edu"
__status__ = "Development"

import sys
import argparse
import warnings
warnings.simplefilter(action = "ignore", category = FutureWarning)
from src.breadcrumbs.src.AbundanceTable import AbundanceTable
from src.breadcrumbs.src.ConstantsBreadCrumbs import ConstantsBreadCrumbs
from src.breadcrumbs.src.Metric import Metric
from src.breadcrumbs.src.KMedoids import Kmedoids
from src.breadcrumbs.src.MLPYDistanceAdaptor import MLPYDistanceAdaptor
from src.breadcrumbs.src.SVM import SVM
from src.breadcrumbs.src.UtilityMath import UtilityMath

from src.ConstantsMicropita import ConstantsMicropita
import csv
import logging
import math
import mlpy
import numpy as np
import operator
import os
import random
import scipy.cluster.hierarchy as hcluster
import scipy.spatial.distance
from types import *

class MicroPITA:
	"""
	Selects samples from a first tier of a multi-tiered study to be used in a second tier.
	Different methods can be used for selection.
	The expected input is an abundance table (and potentially a text file of targeted features,
	if using the targeted features option). Output is a list of samples exhibiting the
	characteristics of interest.
	"""

	#Constants
	#Diversity metrics Alpha (metric names re-exported from breadcrumbs' Metric class)
	c_strInverseSimpsonDiversity = Metric.c_strInvSimpsonDiversity
	c_strChao1Diversity = Metric.c_strChao1Diversity

	#Diversity metrics Beta (between-sample dissimilarity)
	c_strBrayCurtisDissimilarity = Metric.c_strBrayCurtisDissimilarity

	#Additive inverses of diversity metrics beta
	c_strInvBrayCurtisDissimilarity = Metric.c_strInvBrayCurtisDissimilarity

	#Technique Names
	#NOTE(review): this assigns a new attribute onto ConstantsMicropita as a side effect of
	#defining this class (a second diversity method name derived from the first) — confirm intended.
	ConstantsMicropita.c_strDiversity2 = ConstantsMicropita.c_strDiversity+"_C"

	#Targeted feature settings (method keywords for targeted-feature selection)
	c_strTargetedRanked = ConstantsMicropita.c_strTargetedRanked
	c_strTargetedAbundance = ConstantsMicropita.c_strTargetedAbundance

	#Technique groupings
#	c_lsDiversityMethods = [ConstantsMicropita.c_strDiversity,ConstantsMicropita.c_strDiversity2]

	#Converts ecology metrics into standardized method selection names
	dictConvertAMetricDiversity = {c_strInverseSimpsonDiversity:ConstantsMicropita.c_strDiversity, c_strChao1Diversity:ConstantsMicropita.c_strDiversity2}
#	dictConvertMicroPITAToAMetric = {ConstantsMicropita.c_strDiversity:c_strInverseSimpsonDiversity, ConstantsMicropita.c_strDiversity2:c_strChao1Diversity}
	dictConvertBMetricToMethod = {c_strBrayCurtisDissimilarity:ConstantsMicropita.c_strRepresentative}
	dictConvertInvBMetricToMethod = {c_strBrayCurtisDissimilarity:ConstantsMicropita.c_strExtreme}

	#Linkage used in the Hierarchical clustering (see funcSelectExtremeSamplesFromHClust)
	c_strHierarchicalClusterMethod = 'average'
####Group 1## Diversity
	#Testing: Happy path Testing (8)
	def funcGetTopRankedSamples(self, lldMatrix = None, lsSampleNames = None, iTopAmount = None):
		"""
		Given a list of lists of measurements, for each list the indices of the highest values are returned. If lsSampleNames is given
			it is treated as a list of string names that is in the order of the measurements in each list. Indices are returned or the sample
			names associated with the indices.

		:param	lldMatrix:	List of lists [[value,value,value,value],[value,value,value,value]].
		:type:	List of lists	List of measurements. Each list is a different measurement. Each measurement in positionally related to a sample.
		:param	lsSampleNames:	List of sample names positionally related (the same) to each list (Optional).
		:type:	List of strings	List of strings.
		:param	iTopAmount:	The amount of top measured samples (assumes the higher measurements are better).
		:type:	integer	Integer amount of sample names/ indices to return.
		:return	List:	List of samples to be selected.
		"""
		topRankListRet = []
		for rowMetrics in lldMatrix:
			#Indices of the measurements ordered from highest to lowest value.
			#sorted() is used instead of sorting range() in place: range() is not a
			#list on Python 3, so the original ".sort()" call only worked on Python 2.
			liSortedIndices = sorted(range(len(rowMetrics)), key = rowMetrics.__getitem__, reverse = True)

			if lsSampleNames:
				topRankListRet.append([lsSampleNames[iIndex] for iIndex in liSortedIndices[:iTopAmount]])
			else:
				topRankListRet.append(liSortedIndices[:iTopAmount])

		return topRankListRet
	
	####Group 2## Representative Dissimilarity
	#Testing: Happy path tested 1
	def funcGetCentralSamplesByKMedoids(self, npaMatrix=None, sMetric=None, lsSampleNames=None, iNumberSamplesReturned=0, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None):
		"""
		Gets centroid samples by k-medoids clustering of a given matrix.
		
		:param	npaMatrix:	Numpy array where row=features and columns=samples
		:type:	Numpy array	Abundance Data.
		:param	sMetric:	String name of beta metric used as the distance metric.
		:type:	String	String name of beta metric.
		:param	lsSampleNames:	The names of the sample
		:type:	List	List of strings
		:param	iNumberSamplesReturned:	Number of samples to return, each will be a centroid of a sample.
		:type:	Integer	Number of samples to return
		:return	List:	List of selected samples.
		:param	istmBetaMatrix: File with beta-diversity matrix
		:type:	File stream or file path string
		:param	istrmTree:	Tree file, forwarded to the beta-metric computation (used by unifrac metrics).
		:type:	File stream or file path string
		:param	istrmEnvr:	Environment file, forwarded to the beta-metric computation (used by unifrac metrics).
		:type:	File stream or file path string
		"""

		#Count of how many rows
		sampleCount = npaMatrix.shape[0]
		#Cannot return more medoids than there are samples; False signals the error to callers
		if iNumberSamplesReturned > sampleCount:
			logging.error("MicroPITA.funcGetCentralSamplesByKMedoids:: There are not enough samples to return the amount of samples specified. Return sample count = "+str(iNumberSamplesReturned)+". Sample number = "+str(sampleCount)+".")
			return False

		#If the cluster count is equal to the sample count return all samples
		if sampleCount == iNumberSamplesReturned:
			return list(lsSampleNames)

		#Get distance matrix: either read a precomputed beta matrix from file (converted to
		#condensed form via squareform) or compute the beta metric from the abundances
		distanceMatrix=scipy.spatial.distance.squareform(Metric.funcReadMatrixFile(istmMatrixFile=istmBetaMatrix,lsSampleOrder=lsSampleNames)[0]) if istmBetaMatrix else Metric.funcGetBetaMetric(npadAbundancies=npaMatrix, sMetric=sMetric, istrmTree=istrmTree, istrmEnvr=istrmEnvr, lsSampleOrder=lsSampleNames)
		#NOTE(review): BooleanType comes from the "from types import *" import and is Python 2 only;
		#this detects a False returned by the metric helpers on failure
		if type(distanceMatrix) is BooleanType:
			logging.error("MicroPITA.funcGetCentralSamplesByKMedoids:: Could not read in the supplied distance matrix, returning false.")
			return False

		# Handle unifrac output
		# presumably the unifrac metrics return a tuple whose first element is the matrix — TODO confirm
		if sMetric in [Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted]:
			distanceMatrix = distanceMatrix[0]
	
		#Log distance matrix
		logging.debug("MicroPITA.funcGetCentralSamplesByKMedoids:: Distance matrix for representative selection using metric="+str(sMetric))
	
		#Adaptor lets the mlpy Kmedoids implementation look distances up in our precomputed matrix
		distance = MLPYDistanceAdaptor(npaDistanceMatrix=distanceMatrix, fIsCondensedMatrix=True)
	
		#Create object to determine clusters/medoids
		medoidsMaker = Kmedoids(k=iNumberSamplesReturned, dist=distance)
		#medoidsData includes(1d numpy array, medoids indexes; 
		#			  1d numpy array, non-medoids indexes;
		#			  1d numpy array, cluster membership for non-medoids;
		#			  double, cost of configuration)
		#npaMatrix is samples x rows
		#Build a matrix of lists of indicies to pass to the distance matrix
		#NOTE(review): xrange is Python 2 only
		lliIndicesMatrix = [[iIndexPosition] for iIndexPosition in xrange(0,len(npaMatrix))]
		medoidsData = medoidsMaker.compute(np.array(lliIndicesMatrix))
		logging.debug("MicroPITA.funcGetCentralSamplesByKMedoids:: Results from the kmedoid method in representative selection:")
		logging.debug(str(medoidsData))
	
		#If returning the same amount of clusters and samples
		#Return centroids
		#Element 0 of the kmedoids result holds the medoid indices; map them back to sample names
		selectedIndexes = medoidsData[0]
		return [lsSampleNames[selectedIndexes[index]] for index in xrange(0,iNumberSamplesReturned)]
	
	####Group 3## Highest Dissimilarity
	#Testing: Happy path tested
	def funcSelectExtremeSamplesFromHClust(self, strBetaMetric, npaAbundanceMatrix, lsSampleNames, iSelectSampleCount, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None):
		"""
		Select extreme samples from HClustering.

		:param	strBetaMetric:	The beta metric to use for distance matrix generation.
		:type:	String	The name of the beta metric to use.
		:param	npaAbundanceMatrix:	Numpy array where row=samples and columns=features.
		:type:	Numpy Array	Abundance data.
		:param	lsSampleNames:	The names of the sample.
		:type:	List	List of strings.
		:param	iSelectSampleCount:	Number of samples to select (return).
		:type:	Integer	Integer number of samples returned.
		:return	Samples:	List of samples.
		:param	istmBetaMatrix: File with beta-diversity matrix
		:type:	File stream or file path string
		"""

		#If they want all the sample count, return all sample names
		iSampleCount = len(npaAbundanceMatrix[:,0])
		if iSelectSampleCount == iSampleCount:
			return lsSampleNames

		#Holds the samples to be returned
		lsReturnSamplesRet = []

		#Generate beta matrix (condensed form): either read a precomputed matrix from file or
		#compute the additive inverse of the beta metric from the abundances
		tempDistanceMatrix = scipy.spatial.distance.squareform(Metric.funcReadMatrixFile(istmMatrixFile=istmBetaMatrix,lsSampleOrder=lsSampleNames)[0]) if istmBetaMatrix else Metric.funcGetBetaMetric(npadAbundancies=npaAbundanceMatrix, sMetric=strBetaMetric, istrmTree=istrmTree, istrmEnvr=istrmEnvr, lsSampleOrder=lsSampleNames, fAdditiveInverse = True)

		#Check for failure (False) before any indexing; isinstance replaces the
		#Python-2-only types.BooleanType check and mirrors funcGetCentralSamplesByKMedoids
		if isinstance(tempDistanceMatrix, bool):
			logging.error("MicroPITA.funcSelectExtremeSamplesFromHClust:: Could not read in the supplied distance matrix, returning false.")
			return False

		#Unifrac metrics wrap the matrix in a tuple; unwrap the first element
		if strBetaMetric in [Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted]:
			tempDistanceMatrix = tempDistanceMatrix[0]

		#A matrix supplied from file holds plain dissimilarities; invert so the
		#linkage sees the same additive-inverse scale as the computed branch
		if istmBetaMatrix:
			tempDistanceMatrix = 1-tempDistanceMatrix

		#Feed beta matrix to linkage to cluster
		#Send condensed matrix
		linkageMatrix = hcluster.linkage(tempDistanceMatrix, method=self.c_strHierarchicalClusterMethod)

		#Extract cluster information from dendrogram
		#The linkage matrix is of the form
		#[[int1 int2 double int3],...]
		#int1 and int2 are the paired samples indexed at 0 and up.
		#each list is an entry for a branch that is number starting with the first
		#list being sample count index + 1
		#each list is then named by an increment as they appear
		#this means that if a number is in the list and is = sample count or greater it is not
		#terminal and is instead a branch.
		#This method just takes the lowest metric measurement (highest distance pairs/clusters)
		#Works much better than the original technique
		iCurrentSelectCount = 0
		for row in linkageMatrix:
			#Get nodes of the lowest pairing (so the furthest apart pair)
			iNode1 = int(row[0])
			iNode2 = int(row[1])
			#Make sure the nodes are a terminal node (sample) and not a branch in the dendrogram
			#The branching in the dendrogram will start at the number of samples and increment higher.
			#Add each of the pair one at a time breaking when enough samples are selected.
			if iNode1 < iSampleCount:
				lsReturnSamplesRet.append(lsSampleNames[iNode1])
				iCurrentSelectCount = iCurrentSelectCount + 1
			if iCurrentSelectCount == iSelectSampleCount:
				break
			if iNode2 < iSampleCount:
				lsReturnSamplesRet.append(lsSampleNames[iNode2])
				iCurrentSelectCount = iCurrentSelectCount + 1
			if iCurrentSelectCount == iSelectSampleCount:
				break

		#Return selected samples
		return lsReturnSamplesRet
	
	####Group 4## Rank Average of user Defined Taxa
		#Testing: Happy Path Tested
	def funcGetAverageAbundanceSamples(self, abndTable, lsTargetedFeature, fRank=False):
		"""
		Averages feature abundance or ranked abundance. Expects a column 0 of taxa id that is skipped.

		:param	abndTable:	Abundance Table to analyse
		:type:	AbundanceTable	Abundance Table
		:param	lsTargetedFeature:	String names
		:type:	list	list of string names of features (bugs) which are measured after ranking against the full sample
		:param	fRank:	Indicates to rank the abundance before getting the average abundance of the features (default false)
		:type:	boolean	Flag indicating ranking abundance before calculating average feature measurement (false= no ranking)
		:return	List of lists or boolean:	List of lists or False on error. One internal list per sample indicating the sample,
				feature average abundance or ranked abundance. Lists will already be sorted.
				For not Ranked [[sample,average abundance of selected feature,1]]
				For Ranked [[sample,average ranked abundance, average abundance of selected feature]]
				Error Returns false
		"""

		llAbundance = abndTable.funcGetAverageAbundancePerSample(lsTargetedFeature)
		if not llAbundance:
			logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not get average abundance, returned false. Make sure the features (bugs) are spelled correctly and in the abundance table.")
			return False
		#Insert a placeholder rank column: entries become [sSample, -1, average abundance].
		#When ranking is requested the -1 is replaced below by the sample's average rank,
		#so the list layout is the same in both modes.
		llRetAbundance = [[llist[0],-1,llist[1]] for llist in llAbundance]
		#Rank if needed
		if fRank:
			abndRanked = abndTable.funcRankAbundance()
			#"is None" replaces the fragile "== None" comparison
			if abndRanked is None:
				logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not rank the abundance table, returned false.")
				return False
			llRetRank = abndRanked.funcGetAverageAbundancePerSample(lsTargetedFeature)
			if not llRetRank:
				logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not get average ranked abundance, returned false. Make sure the features (bugs) are spelled correctly and in the abundance table.")
				return False
			dictRanks = dict(llRetRank)
			llRetAbundance = [[a[0],dictRanks[a[0]],a[2]] for a in llRetAbundance]

		#Sort first for ties and then for the main feature
		#(not ranked: descending by abundance; ranked: ascending by rank, lower rank is better)
		if not fRank or ConstantsMicropita.c_fBreakRankTiesByDiversity:
			llRetAbundance = sorted(llRetAbundance, key = lambda sampleData: sampleData[2], reverse = not fRank)
		if fRank:
			llRetAbundance = sorted(llRetAbundance, key = lambda sampleData: sampleData[1], reverse = not fRank)
		return llRetAbundance
	
	#Testing: Happy Path Tested
	def funcSelectTargetedTaxaSamples(self, abndMatrix, lsTargetedTaxa, iSampleSelectionCount, sMethod = ConstantsMicropita.lsTargetedFeatureMethodValues[0]):
		"""
		Selects samples with the highest ranks or abundance of targeted features.
		If ranked, select the highest abundance for tie breaking

		:param	abndMatrix:	Abundance table to analyse
		:type:	AbundanceTable	Abundance table
		:param	lsTargetedTaxa:	List of features
		:type:	list	list of strings
		:param	iSampleSelectionCount:	Number of samples to select
		:type:	integer	integer
		:param	sMethod:	Method to select targeted features
		:type:	string	String (Can be values found in ConstantsMicropita.lsTargetedFeatureMethodValues)
		:return	List of strings:	List of sample names which were selected
		List of strings	Empty list is returned on an error.
		"""

		#Check data: at least one targeted feature is required
		if not lsTargetedTaxa:
			logging.error("MicroPITA.funcSelectTargetedTaxaSamples. Taxa defined selection was requested but no features were given.")
			return []

		#Rank-based averaging is requested via the ranked-method keyword (case-insensitive)
		lsTargetedSamples = self.funcGetAverageAbundanceSamples(abndTable=abndMatrix, lsTargetedFeature=lsTargetedTaxa,
			fRank=sMethod.lower() == self.c_strTargetedRanked.lower())
		#If an error occured or the key word for the method was not recognized.
		#Identity check: a legitimately empty result list must not be mistaken for failure.
		if lsTargetedSamples is False:
			logging.error("MicroPITA.funcSelectTargetedTaxaSamples:: Was not able to select for the features given. So targeted feature selection was performed. Check to make sure the features are spelled correctly and exist in the abundance file.")
			return []

		#Select from results
		return [sSample[0] for sSample in lsTargetedSamples[:iSampleSelectionCount]]
	
	####Group 5## Random
	#Testing: Happy path Tested
	def funcGetRandomSamples(self, lsSamples=None, iNumberOfSamplesToReturn=0):
		"""
		Returns random sample names of the number given. No replacement.
		The original ordering of the surviving samples is preserved.

		:param	lsSamples:	List of sample names 
		:type:	list	list of strings
		:param	iNumberOfSamplesToReturn:	Number of samples to select
		:type:	integer	integer.
		:return	List:	List of selected samples (strings).
		"""

		#Input matrix sample count
		sampleCount = len(lsSamples)

		#Return the full matrix if they ask for a return matrix where length >= original
		if iNumberOfSamplesToReturn >= sampleCount:
			return lsSamples

		#Get the random indices for the sample (without replacement).
		#A set gives O(1) membership tests in the comprehension below.
		setRandomIndices = set(random.sample(range(sampleCount), iNumberOfSamplesToReturn))

		#Keep only the samples whose index was drawn, preserving original order
		return [sSample for iIndex, sSample in enumerate(lsSamples) if iIndex in setRandomIndices]

	#Happy path tested (case 3)
	def funcGetAveragePopulation(self, abndTable, lfCompress):
		"""
		Get the average row per column in the abndtable.

		:param abndTable: AbundanceTable of data to be averaged
		:type: AbudanceTable
		:param lfCompress: List of boolean flags (false means to remove sample before averaging)
		:type: List of floats
		:return List of doubles: Mean of each feature over the flagged samples ([] if none flagged).
		"""
		#No samples flagged, nothing to average
		if not sum(lfCompress):
			return []

		#Mean of each feature row over the selected samples
		ldMeansRet = []
		for tplFeature in abndTable.funcGetAbundanceCopy():
			#Drop the leading feature id, keep only the flagged sample measurements
			ldMeasurements = np.compress(lfCompress, list(tplFeature)[1:], axis=0)
			ldMeansRet.append(sum(ldMeasurements)/float(len(ldMeasurements)))
		return ldMeansRet

	#Happy path tested (2 cases)
	def funcGetDistanceFromAverage(self, abndTable, ldAverage, lsSamples, lfSelected):
		"""
		Given an abundance table and an average sample, this returns the distance of each sample
		(measured using brays-curtis dissimilarity) from the average.
		The distances are reduced by needing to be in the lsSamples and being a true in the lfSelected
		(which is associated with the samples in the order of the samples in the abundance table;
		use abundancetable.funcGetSampleNames() to see the order if needed).

		:param abndTable: Abundance table holding the data to be analyzed.
		:type: AbundanceTable
		:param ldAverage: Average population (Average features of the abundance table of samples)
		:type: List of doubles which represent the average population
		:param lsSamples: These are the only samples used in the analysis
		:type: List of strings (sample ids)
		:param lfSelected: Samples to be included in the analysis
		:type: List of boolean (true means include)
		:return: List of distances (doubles)
		"""
		#Samples flagged for inclusion, in their original order
		lsFlaggedSamples = [sName for iPosition, sName in enumerate(lsSamples) if lfSelected[iPosition]]

		#Bray-Curtis dissimilarity of each flagged sample from the average population
		return [Metric.funcGetBrayCurtisDissimilarity(np.array([abndTable.funcGetSample(sName), ldAverage]))[0]
			for sName in lsFlaggedSamples]

	#Happy path tested (1 case)
	def funcMeasureDistanceFromLabelToAverageOtherLabel(self, abndTable, lfGroupOfInterest, lfGroupOther):
		"""
		Get the distance of samples from one label from the average sample of not the label.
		Note: This assumes 2 classes.  

		:param abndTable: Table of data to work out of.
		:type: Abundace Table
		:param lfGroupOfInterest: Boolean indicator of the sample being in the first group.
		:type: List of booleans, true indicating an individual in the group of interest.
		:param lfGroupOther:	Boolean indicator of the sample being in the other group.
		:type:	List of booleans, true indicating an individual in the other group.
		:return List of tuples: list of tuples (string sample name, double distance) for the group of interest.
		"""
		#Get all sample names
		lsAllSamples = abndTable.funcGetSampleNames()

		#Get average (centroid) population of the other group
		lAverageOther = self.funcGetAveragePopulation(abndTable=abndTable, lfCompress=lfGroupOther)

		#Get the distance from the average of the other label (label 1)
		ldSelectedDistances = self.funcGetDistanceFromAverage(abndTable=abndTable, ldAverage=lAverageOther,
			lsSamples=lsAllSamples, lfSelected=lfGroupOfInterest)

		#list() so a list (not an iterator) is returned on Python 3 as documented;
		#identical to the original behavior on Python 2
		return list(zip([lsAllSamples[iindex] for iindex, fGroup in enumerate(lfGroupOfInterest) if fGroup],ldSelectedDistances))

	#Happy path tested (1 test case)
	def funcPerformDistanceSelection(self, abndTable, iSelectionCount, sLabel, sValueOfInterest):
		"""
		Given metadata, metadata of one value (sValueOfInterest) is measured from the average (centroid) value of another label group.
		An iSelectionCount of samples is selected from the group of interest closest to and furthest from the centroid of the other group.

		:params  abndTable: Abundance of measurements
		:type: AbundanceTable
		:params iSelectionCount: The number of samples selected per sample.
		:type: Integer Integer greater than 0
		:params sLabel: ID of the metadata which is the supervised label
		:type: String
		:params sValueOfInterest: Metadata value in the sLabel metadta row of the abundance table which defines the group of interest.
		:type: String found in the abundance table metadata row indicated by sLabel.
		:return list list of tuples (samplename, distance) [[iSelectionCount of tuples closest to the other centroid], [iSelectionCount of tuples farthest from the other centroid], [all tuples of samples not selected]]
		"""

		lsMetadata = abndTable.funcGetMetadata(sLabel)
		#Other metadata values.
		#Fixed: subtract {sValueOfInterest} as a whole value; the previous set(sValueOfInterest)
		#built the set of the string's *characters*, leaving the group of interest in the
		#"other" groups (so it was measured against its own centroid).
		lsUniqueOtherValues = list(set(lsMetadata)-set([sValueOfInterest]))

		#Get boolean indicator of values of interest
		lfLabelsInterested = [sValueOfInterest == sValue for sValue in lsMetadata]

		#Get the distances of the items of interest from the other metadata values
		dictDistanceAverages = {}
		for sOtherLabel in lsUniqueOtherValues:
			#Get boolean indicator of labels not of interest 
			lfLabelsOther = [sOtherLabel == sValue for sValue in lsMetadata]

			#Get the distances of data from two different groups to the average of the other
			ldValueDistances = dict(self.funcMeasureDistanceFromLabelToAverageOtherLabel(abndTable, lfLabelsInterested, lfLabelsOther))

			#Accumulate per-sample distance sums over every "other" label group
			for sKey in ldValueDistances:
				dictDistanceAverages[sKey] = ldValueDistances[sKey] + dictDistanceAverages[sKey] if sKey in dictDistanceAverages else ldValueDistances[sKey]

		#Finish average by dividing by length of lsUniqueOtherValues
		ltpleAverageDistances = [(sKey, dictDistanceAverages[sKey]/float(len(lsUniqueOtherValues))) for sKey in dictDistanceAverages]

		#Sort to extract extremes
		ltpleAverageDistances = sorted(ltpleAverageDistances,key=operator.itemgetter(1))

		#Get the closest and farthest distances
		ltupleDiscriminantSamples = ltpleAverageDistances[:iSelectionCount]
		ltupleDistinctSamples = ltpleAverageDistances[iSelectionCount*-1:]

		#Names of all selected samples, used to remove them from the larger
		#population of distances (better visualization)
		lsSelectedNames = [tpleSelected[0] for tpleSelected in ltupleDiscriminantSamples+ltupleDistinctSamples]

		#Return discriminant tuples, distinct tuples, other tuples
		return [ltupleDiscriminantSamples, ltupleDistinctSamples,
			[tplData for tplData in ltpleAverageDistances if tplData[0] not in lsSelectedNames]]

	#Run the supervised method surrounding distance from centroids
	#Happy path tested (3 test cases)
	def funcRunSupervisedDistancesFromCentroids(self, abundanceTable, fRunDistinct, fRunDiscriminant,
						xOutputSupFile, xPredictSupFile, strSupervisedMetadata,
						iSampleSupSelectionCount, lsOriginalSampleNames, lsOriginalLabels, fAppendFiles = False):
		"""
		Runs supervised methods based on measuring distances of one label from the centroid of another. NAs are evaluated as theirown group.

		:param	abundanceTable:	AbundanceTable
		:type:	AbudanceTable	Data to analyze
		:param	fRunDistinct:	Run distinct selection method
		:type:	Boolean	boolean (true runs method)
		:param	fRunDiscriminant:	Run discriminant method
		:type:	Boolean	boolean (true runs method)
		:param	xOutputSupFile:	File output from supervised methods detailing data going into the method.
		:type:	String or FileStream
		:param	xPredictSupFile:	File output from supervised methods distance results from supervised methods.
		:type:	String or FileStream
		:param strSupervisedMetadata:	The metadata that will be used to group samples.
		:type:	String
		:param	iSampleSupSelectionCount:	Number of samples to select
		:type:	Integer	int sample selection count
		:param lsOriginalSampleNames:	List of the sample names, order is important and should be preserved from the abundanceTable.
		:type:	List of samples	
		:param lsOriginalLabels:	Labels forwarded to the SVM-style output file writers.
		:type:	List
		:param	fAppendFiles:	Indicates that output files already exist and appending is occuring.
		:type:	Boolean
		:return	Selected Samples:	A dictionary of selected samples by selection ID
		Dictionary	{"Selection Method":["SampleID","SampleID"...]}
		"""
		#Get labels and run one label against many
		lstrMetadata = abundanceTable.funcGetMetadata(strSupervisedMetadata)
		#Maps each metadata value to the 3-list result of funcPerformDistanceSelection:
		#[discriminant tuples, distinct tuples, unselected tuples]
		dictlltpleDistanceMeasurements = {}
		for sMetadataValue in set(lstrMetadata):

			#For now perform the selection here for the label of interest against the other labels
			dictlltpleDistanceMeasurements.setdefault(sMetadataValue,[]).extend(self.funcPerformDistanceSelection(abndTable=abundanceTable,
				iSelectionCount=iSampleSupSelectionCount, sLabel=strSupervisedMetadata, sValueOfInterest=sMetadataValue))

		#Make expected output files for supervised methods
		#1. Output file which is similar to an input file for SVMs
		#2. Output file that is similar to the probabilitic output of a SVM (LibSVM)
		#Manly for making output of supervised methods (Distance from Centroid) similar
		#MicropitaVis needs some of these files
		if xOutputSupFile:
			if fAppendFiles:
				SVM.funcUpdateSVMFileWithAbundanceTable(abndAbundanceTable=abundanceTable, xOutputSVMFile=xOutputSupFile,
					lsOriginalLabels=lsOriginalLabels, lsSampleOrdering=lsOriginalSampleNames)
			else:
				SVM.funcConvertAbundanceTableToSVMFile(abndAbundanceTable=abundanceTable, xOutputSVMFile=xOutputSupFile,
					sMetadataLabel=strSupervisedMetadata, lsOriginalLabels=lsOriginalLabels, lsSampleOrdering=lsOriginalSampleNames)

		#Will contain the samples selected to return
		#One or more of the methods may be active so this is why I am extending instead of just returning the result of each method type
		dictSelectedSamplesRet = dict()
		for sKey, ltplDistances in dictlltpleDistanceMeasurements.items():
			#ltplDistances[1] = distinct (farthest) tuples, ltplDistances[0] = discriminant (closest) tuples
			if fRunDistinct:
				dictSelectedSamplesRet.setdefault(ConstantsMicropita.c_strDistinct,[]).extend([ltple[0] for ltple in ltplDistances[1]])
			if fRunDiscriminant:
				dictSelectedSamplesRet.setdefault(ConstantsMicropita.c_strDiscriminant,[]).extend([ltple[0] for ltple in ltplDistances[0]])

		if xPredictSupFile:
			#Flatten the per-label 3-list structure into {label: [all (sample, distance) tuples]}
			#(side-effecting list comprehension, used here for brevity)
			dictFlattenedDistances = dict()
			[dictFlattenedDistances.setdefault(sKey, []).append(tple)
				for sKey, lltple in dictlltpleDistanceMeasurements.items()
				for ltple in lltple for tple in ltple]
			if fAppendFiles:
				self._updatePredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xOutputSupFile,
					dictltpleDistanceMeasurements=dictFlattenedDistances, abundanceTable=abundanceTable, lsOriginalSampleNames=lsOriginalSampleNames)
			else:
				self._writeToPredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xOutputSupFile,
					dictltpleDistanceMeasurements=dictFlattenedDistances, abundanceTable=abundanceTable, lsOriginalSampleNames=lsOriginalSampleNames)
		return dictSelectedSamplesRet

	#Two happy path test cases
	def _updatePredictFile(self, xPredictSupFile, xInputLabelsFile, dictltpleDistanceMeasurements, abundanceTable, lsOriginalSampleNames):
		"""
		Manages updating the predict file.

		:param	xPredictSupFile: File that has predictions (distances) from the supervised method.
		:type:	FileStream or String file path
		:param	xInputLabelsFile: File that as input to the supervised methods.
		:type:	FileStream or String file path
		:param	dictltpleDistanceMeasurements: 
		:type:	Dictionary of lists of tuples {"labelgroup":[("SampleName",dDistance)...], "labelgroup":[("SampleName",dDistance)...]}
		:param	abundanceTable: Abundance table of the sample data, passed through to the writer.
		:type:	AbundanceTable
		:param	lsOriginalSampleNames: Sample names used to label the rows read from the existing file.
		:type:	List of strings
		"""

		#Work from a file path: close and reopen by name if an open stream was supplied
		if not isinstance(xPredictSupFile, str):
			xPredictSupFile.close()
			xPredictSupFile = xPredictSupFile.name
		csvr = open(xPredictSupFile,'r')

		f = csv.reader(csvr,delimiter=ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace)
		#First row is the header; drop the leading "labels" column
		#NOTE(review): f.next() is Python 2 only (next(f) on Python 3)
		lsHeader = f.next()[1:]
		dictlltpleRead = dict([(sHeader,[]) for sHeader in lsHeader])

		#Read data in 
		#Each row: [label, distance-per-header-column...]; rows are positionally matched
		#to lsOriginalSampleNames by iSampleIndex
		iSampleIndex = 0
		for sRow in f:
			sLabel = sRow[0]
			#Collect (sample name, distance) per header column, skipping empty placeholder values
			#(side-effecting list comprehension, used here for brevity)
			[dictlltpleRead[lsHeader[iDistanceIndex]].append((lsOriginalSampleNames[iSampleIndex],dDistance)) for iDistanceIndex, dDistance in enumerate(sRow[1:])
				if not dDistance == ConstantsMicropita.c_sEmptyPredictFileValue]
			iSampleIndex += 1

		#Combine dictltpleDistanceMeasurements with new data
		#If they share a key then merge keeping parameter data
		#If they do not share the key, keep the full data
		dictNew = {}
		for sKey in dictltpleDistanceMeasurements.keys():
			lsSamples = [tple[0] for tple in dictltpleDistanceMeasurements[sKey]]
			dictNew[sKey] = dictltpleDistanceMeasurements[sKey]+[tple for tple in dictlltpleRead[sKey] if tple[0] not in lsSamples] if sKey in dictlltpleRead.keys() else dictltpleDistanceMeasurements[sKey]
                for sKey in dictlltpleRead:
			if sKey not in dictltpleDistanceMeasurements.keys():
				dictNew[sKey] = dictlltpleRead[sKey]

		#Call writer
		self._writeToPredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xInputLabelsFile,
			dictltpleDistanceMeasurements=dictNew, abundanceTable=abundanceTable,
			lsOriginalSampleNames=lsOriginalSampleNames, fFromUpdate=True)

	#2 happy path test cases
        def _writeToPredictFile(self, xPredictSupFile, xInputLabelsFile, dictltpleDistanceMeasurements, abundanceTable, lsOriginalSampleNames, fFromUpdate=False):
		"""
		Write to the predict file.

		The file is a delimited table: a header row of "labels" plus one column
		per label group, then one row per sample holding the sample's label and
		its distance under each label group (missing entries are written as the
		empty-value placeholder).

		:param	xPredictSupFile: File that has predictions (distances) from the supervised method.
		:type:	FileStream or String file path
		:param	xInputLabelsFile: File that as input to the supervised methods.
		:type:	FileStream or String file path
		:param	dictltpleDistanceMeasurements: Distances to write, keyed by label group.
		:type:	Dictionary of lists of tuples {"labelgroup":[("SampleName",dDistance)...], "labelgroup":[("SampleName",dDistance)...]}
		:param	abundanceTable: An abundance table of the sample data.
		:type:	AbundanceTable
		:param	lsOriginalSampleNames: Used if the file is being updated as the sample names so that it may be passed in and consistent with other writing.
			Otherwise will use the sample names from the abundance table.
		:type:	List of strings
		:param	fFromUpdate:	Indicates if this is part of an update to the file or not.
		:type:	Boolean
		"""

		#SVM.funcReadLabelsFromFile takes a file path, so unwrap a stream to its name
		xInputLabelsFileName = xInputLabelsFile
		if not isinstance(xInputLabelsFile,str):
			xInputLabelsFileName = xInputLabelsFile.name
		#NOTE(review): when xPredictSupFile is a path the handle opened here is
		#never explicitly closed; it is left to interpreter cleanup
		f = csv.writer(open(xPredictSupFile,"w") if isinstance(xPredictSupFile, str) else xPredictSupFile,delimiter=ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace)

		lsAllSampleNames = abundanceTable.funcGetSampleNames()
		#lsLabels maps label value -> list of sample names; invert it to sample -> label
		lsLabels = SVM.funcReadLabelsFromFile(xSVMFile=xInputLabelsFileName, lsAllSampleNames= lsOriginalSampleNames if fFromUpdate else lsAllSampleNames,
						isPredictFile=False)
		dictLabels = dict([(sSample,sLabel) for sLabel in lsLabels.keys() for sSample in lsLabels[sLabel]])

		#Dictionay keys will be used to order the predict file
		lsMeasurementKeys = dictltpleDistanceMeasurements.keys()
		#Make header
		f.writerow(["labels"]+lsMeasurementKeys)

		#Reformat dictionary to make it easier to use
		#NOTE(review): this mutates the caller's dictionary in place, replacing each
		#list of (sample, distance) tuples with a {sample: distance} dict
		for sKey in dictltpleDistanceMeasurements:
			dictltpleDistanceMeasurements[sKey] = dict([ltpl for ltpl in dictltpleDistanceMeasurements[sKey]])

		for sSample in lsOriginalSampleNames:
			#Make body of file
			#Row = sample's label (or placeholder) followed by its distance under
			#each label group, in header-column order
			f.writerow([dictLabels.get(sSample,ConstantsMicropita.c_sEmptyPredictFileValue)]+
				[str(dictltpleDistanceMeasurements[sKey].get(sSample,ConstantsMicropita.c_sEmptyPredictFileValue))
				for sKey in lsMeasurementKeys])

	def _funcRunNormalizeSensitiveMethods(self, abndData, iSampleSelectionCount, dictSelectedSamples, lsAlphaMetrics, lsBetaMetrics, lsInverseBetaMetrics,
												fRunDiversity, fRunRepresentative, fRunExtreme, strAlphaMetadata=None,
												istmBetaMatrix=None, istrmTree=None, istrmEnvr=None, fInvertDiversity=False):
		"""
		Manages running methods that are sensitive to normalization. This is called twice, once for the set of methods which should not be normalized and the other
		for the set that should be normalized.
	
		:param	abndData:	Abundance table object holding the samples to be measured.
		:type:	AbundanceTable
		:param	iSampleSelectionCount	The number of samples to select per method.
		:type:	Integer
		:param	dictSelectedSamples	Will be added to as samples are selected {"Method:["strSelectedSampleID","strSelectedSampleID"...]}.
		:type:	Dictionary
		:param	lsAlphaMetrics:	List of alpha metrics to use on alpha metric dependent assays (like highest diversity).
		:type:	List of strings
		:param	lsBetaMetrics:	List of beta metrics to use on beta metric dependent assays (like most representative).
		:type:	List of strings
		:param	lsInverseBetaMetrics:	List of inverse beta metrics to use on inverse beta metric dependent assays (like most dissimilar).
		:type:	List of strings
		:param	fRunDiversity:	Run Diversity based methods (true indicates run).
		:type:	Boolean	
		:param	fRunRepresentative:	Run Representative based methods (true indicates run).
		:type:	Boolean	
		:param	fRunExtreme:	Run Extreme based methods (true indicates run).
		:type:	Boolean	
		:param	strAlphaMetadata:	Metadata id holding precalculated alpha-diversity values to use instead of computing metrics.
		:type:	String
		:param	istmBetaMatrix:	File that has a precalculated beta matrix
		:type:	File stream or File path string
		:param	istrmTree:	File containing tree for phylogentic beta-diversity analysis
		:type:	File stream or File path string
		:param	istrmEnvr:	File containing environment for phylogentic beta-diversity analysis
		:type:	File stream or File path string
		:param	fInvertDiversity:	When true, invert alpha-diversity measurements (1/value) before ranking.
		:type:	Boolean
		:return	Selected Samples:	Samples selected by methods.
				Dictionary	{"Selection Method":["SampleID","SampleID","SampleID",...]}
		"""

		#Sample ids/names
		lsSampleNames = abndData.funcGetSampleNames()

		#Generate alpha metrics and get most diverse
		if fRunDiversity:

			#Get Alpha metrics matrix
			internalAlphaMatrix = None
			#Name of technique
			strMethod = [strAlphaMetadata] if strAlphaMetadata else lsAlphaMetrics

			#If given an alpha-diversity metadata
			#use the precomputed per-sample values directly (one metric row)
			if strAlphaMetadata:
				internalAlphaMatrix = [[float(strNum) for strNum in abndData.funcGetMetadata(strAlphaMetadata)]]
			else:
				#Expects Observations (Taxa (row) x sample (column))
				#Returns [[metric1-sample1, metric1-sample2, metric1-sample3],[metric1-sample1, metric1-sample2, metric1-sample3]]
				#If the table is clade-summed, measure only the terminal nodes to avoid double counting
				internalAlphaMatrix = Metric.funcBuildAlphaMetricsMatrix(npaSampleAbundance = abndData.funcGetAbundanceCopy()
							if not abndData.funcIsSummed()
							else abndData.funcGetFeatureAbundanceTable(abndData.funcGetTerminalNodes()).funcGetAbundanceCopy(),
							lsSampleNames = lsSampleNames, lsDiversityMetricAlpha = lsAlphaMetrics)

			if internalAlphaMatrix:
				#Invert measurments
				#Clamp to a small positive number so a zero measurement cannot divide by zero
				if fInvertDiversity:
					lldNewDiversity = []
					for lsLine in internalAlphaMatrix:
						lldNewDiversity.append([1/max(dValue,ConstantsMicropita.c_smallNumber) for dValue in lsLine])
					internalAlphaMatrix = lldNewDiversity
				#Get top ranked alpha diversity by most diverse
				#Expects [[sample1,sample2,sample3...],[sample1,sample2,sample3..],...]
				#Returns [[sampleName1, sampleName2, sampleNameN],[sampleName1, sampleName2, sampleNameN]]
				mostDiverseAlphaSamplesIndexes = self.funcGetTopRankedSamples(lldMatrix=internalAlphaMatrix, lsSampleNames=lsSampleNames, iTopAmount=iSampleSelectionCount)

				#Add to results
				#One selection entry per metric, keyed by the method's canonical name
				for index in xrange(0,len(strMethod)):
					strSelectionMethod = self.dictConvertAMetricDiversity.get(strMethod[index],ConstantsMicropita.c_strDiversity+"="+strMethod[index])
					dictSelectedSamples.setdefault(strSelectionMethod,[]).extend(mostDiverseAlphaSamplesIndexes[index])

		logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Selected Samples 1b")
		logging.info(dictSelectedSamples)

		#Generate beta metrics and 
		if fRunRepresentative or fRunExtreme:

			#Abundance matrix transposed
			#Beta-metric helpers expect samples as rows
			npaTransposedAbundance = UtilityMath.funcTransposeDataMatrix(abndData.funcGetAbundanceCopy(), fRemoveAdornments=True)

			#Get center selection using clusters/tiling
			#This will be for beta metrics in normalized space
			if fRunRepresentative:

				if istmBetaMatrix:
					#Get representative dissimilarity samples
					#A precalculated matrix replaces metric computation (labeled as the "custom" metric)
					medoidSamples=self.funcGetCentralSamplesByKMedoids(npaMatrix=npaTransposedAbundance, sMetric=ConstantsMicropita.c_custom, lsSampleNames=lsSampleNames, iNumberSamplesReturned=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr)

					if medoidSamples:
						dictSelectedSamples.setdefault(ConstantsMicropita.c_strRepresentative+"="+ConstantsMicropita.c_custom,[]).extend(medoidSamples)
				else:
					logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Performing representative selection on normalized data.")
					for bMetric in lsBetaMetrics:

						#Get representative dissimilarity samples
						medoidSamples=self.funcGetCentralSamplesByKMedoids(npaMatrix=npaTransposedAbundance, sMetric=bMetric, lsSampleNames=lsSampleNames, iNumberSamplesReturned=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr)

						if medoidSamples:
							dictSelectedSamples.setdefault(self.dictConvertBMetricToMethod.get(bMetric,ConstantsMicropita.c_strRepresentative+"="+bMetric),[]).extend(medoidSamples)

			#Get extreme selection using clusters, tiling
			if fRunExtreme:
				logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Performing extreme selection on normalized data.")
				if istmBetaMatrix:

					#Samples for representative dissimilarity
					#This involves inverting the distance metric,
					#Taking the dendrogram level of where the number cluster == the number of samples to select
					#Returning a repersentative sample from each cluster
					extremeSamples = self.funcSelectExtremeSamplesFromHClust(strBetaMetric=ConstantsMicropita.c_custom, npaAbundanceMatrix=npaTransposedAbundance, lsSampleNames=lsSampleNames, iSelectSampleCount=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr)

					#Add selected samples
					if extremeSamples:
						dictSelectedSamples.setdefault(ConstantsMicropita.c_strExtreme+"="+ConstantsMicropita.c_custom,[]).extend(extremeSamples)

				else:
					#Run KMedoids with inverse custom distance metric in normalized space
					for bMetric in lsInverseBetaMetrics:

						#Samples for representative dissimilarity
						#This involves inverting the distance metric,
						#Taking the dendrogram level of where the number cluster == the number of samples to select
						#Returning a repersentative sample from each cluster
						extremeSamples = self.funcSelectExtremeSamplesFromHClust(strBetaMetric=bMetric, npaAbundanceMatrix=npaTransposedAbundance, lsSampleNames=lsSampleNames, iSelectSampleCount=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr)

						#Add selected samples
						if extremeSamples:
							dictSelectedSamples.setdefault(self.dictConvertInvBMetricToMethod.get(bMetric,ConstantsMicropita.c_strExtreme+"="+bMetric),[]).extend(extremeSamples)

		logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Selected Samples 2,3b")
		logging.info(dictSelectedSamples)
		return dictSelectedSamples

	def funcRun(self, strIDName, strLastMetadataName, istmInput,
					  ostmInputPredictFile, ostmPredictFile, ostmCheckedFile, ostmOutput,
					  cDelimiter, cFeatureNameDelimiter, strFeatureSelection,
					  istmFeatures, iCount, lstrMethods, strLastRowMetadata = None, strLabel = None, strStratify = None,
					  strCustomAlpha = None, strCustomBeta = None, strAlphaMetadata = None, istmBetaMatrix = None, istrmTree = None, istrmEnvr = None, 
					  iMinSeqs = ConstantsMicropita.c_liOccurenceFilter[0], iMinSamples = ConstantsMicropita.c_liOccurenceFilter[1], fInvertDiversity = False):
		"""
		Manages the selection of samples given different metrics.

		:param	strIDName: Sample Id metadata row
		:type:	String
		:param	strLastMetadataName: The id of the metadata positioned last in the abundance table.
		:type:	String	String metadata id.
		:param	istmInput: File to store input data to supervised methods.
		:type:	FileStream of String file path
		:param	ostmInputPredictFile: File to store distances from supervised methods.
		:type:	FileStream or String file path
		:param	ostmCheckedFile: File to store the AbundanceTable data after it is being checked.
		:type:	FileStream or String file path
		:param	ostmOutPut: File to store sample selection by methods of interest.
		:type:	FileStream or String file path
		:param	cDelimiter: Delimiter of abundance table.
		:type:	Character Char (default TAB).
		:param	cFeatureNameDelimiter: Delimiter of the name of features (for instance if they contain consensus lineages indicating clades).
		:type:	Character (default |).
		:param	stFeatureSelectionMethod: Which method to use to select features in a targeted manner (Using average ranked abundance or average abundance).
		:type:	String (specific values indicated in ConstantsMicropita.lsTargetedFeatureMethodValues).
		:param	istmFeatures: File which holds the features of interest if using targeted feature methodology.
		:type:	FileStream or String file path
		:param	iCount:	Number of samples to select in each methods, supervised methods select this amount per label if possible.
		:type:	Integer	integer.
		:param	lstrMethods: List of strings indicating selection techniques.
		:type:	List of string method names
		:param	strLastRowMetadata: The id of the last feature metadata row in the abundance table.
		:type:	String
		:param	strLabel: The metadata used for supervised labels.
		:type:	String
		:param	strStratify: The metadata used to stratify unsupervised data.
		:type:	String
		:param	strCustomAlpha: Custom alpha diversity metric
		:type:	String
		:param	strCustomBeta: Custom beta diversity metric
		:type:	String
		:param	strAlphaMetadata: Metadata id which is a diveristy metric to use in highest diversity sampling
		:type:	String
		:param	istmBetaMatrix: File containing precalculated beta-diversity matrix for representative sampling
		:type:	FileStream or String file path
		:param	istrmTree: File containing tree for phylogentic beta-diversity analysis
		:type:	FileStream or String file path
		:param	istrmEnvr: File containing environment for phylogentic beta-diversity analysis
		:type:	FileStream or String file path
		:param	iMinSeqs: Minimum sequence in the occurence filter which filters all features not with a minimum number of sequences in each of a minimum number of samples.
		:type:	Integer
		:param	iMinSamples: Minimum sample count for the occurence filter.
		:type:	Integer
		:param	fInvertDiversity: When true will invert diversity measurements before using.
		:type:	boolean
		:return	Selected Samples:	Samples selected by methods.
				Dictionary	{"Selection Method":["SampleID","SampleID","SampleID",...]}
				Returns False on error.
		"""

		#Holds the top ranked samples from different metrics
		#dict[metric name] = [samplename,samplename...]
		selectedSamples = dict()

		#If a target feature file is given make sure that targeted feature is in the selection methods, if not add
		#NOTE(review): this check is repeated below when c_RUN_RANK_AVERAGE_USER_4 is set
		if ConstantsMicropita.c_strFeature in lstrMethods:
		  if not istmFeatures:
			logging.error("MicroPITA.funcRun:: Did not receive both the Targeted feature file and the feature selection method. MicroPITA did not run.")
			return False

		#Diversity metrics to run
		#Use custom metrics if specified
                #Custom beta metrics set to normalized only, custom alpha metrics set to count only
		diversityMetricsAlpha = [] if strCustomAlpha or strAlphaMetadata else [MicroPITA.c_strInverseSimpsonDiversity]
		diversityMetricsBeta = [] if istmBetaMatrix else [strCustomBeta] if strCustomBeta else [MicroPITA.c_strBrayCurtisDissimilarity]
#		inverseDiversityMetricsBeta = [MicroPITA.c_strInvBrayCurtisDissimilarity]
		diversityMetricsAlphaNoNormalize = [strAlphaMetadata] if strAlphaMetadata else [strCustomAlpha] if strCustomAlpha else []
		diversityMetricsBetaNoNormalize = []
#		inverseDiversityMetricsBetaNoNormalize = []

		#Targeted taxa
		userDefinedTaxa = []

		#Perform different flows flags
		c_RUN_MAX_DIVERSITY_1 = ConstantsMicropita.c_strDiversity in lstrMethods
		c_RUN_REPRESENTIVE_DISSIMILARITY_2 = ConstantsMicropita.c_strRepresentative in lstrMethods
		c_RUN_MAX_DISSIMILARITY_3 = ConstantsMicropita.c_strExtreme in lstrMethods
		c_RUN_RANK_AVERAGE_USER_4 = False
		if ConstantsMicropita.c_strFeature in lstrMethods:
			c_RUN_RANK_AVERAGE_USER_4 = True
			if not istmFeatures:
				logging.error("MicroPITA.funcRun:: No taxa file was given for taxa selection.") 
				return False
			#Read in taxa list, break down to lines and filter out empty strings
			#(Python 2 filter returns a list here)
			userDefinedTaxa = filter(None,(s.strip( ) for s in istmFeatures.readlines()))
		c_RUN_RANDOM_5 = ConstantsMicropita.c_strRandom in lstrMethods
		c_RUN_DISTINCT = ConstantsMicropita.c_strDistinct in lstrMethods
		c_RUN_DISCRIMINANT = ConstantsMicropita.c_strDiscriminant in lstrMethods

		#Read in abundance data
		#Abundance is a structured array. Samples (column) by Taxa (rows) with the taxa id row included as the column index=0
		#Abundance table object to read in and manage data
		totalAbundanceTable = AbundanceTable.funcMakeFromFile(xInputFile=istmInput, lOccurenceFilter = [iMinSeqs, iMinSamples],
								cDelimiter=cDelimiter, sMetadataID=strIDName, sLastMetadataRow=strLastRowMetadata,
								sLastMetadata=strLastMetadataName, cFeatureNameDelimiter=cFeatureNameDelimiter, xOutputFile=ostmCheckedFile)
		if not totalAbundanceTable:
			logging.error("MicroPITA.funcRun:: Could not read in the abundance table. Analysis was not performed."+
				" This often occurs when the Last Metadata is not specified correctly."+
				" Please check to make sure the Last Metadata selection is the row of the last metadata,"+
				" all values after this selection should be microbial measurements and should be numeric.")
			return False

		#When no supervised label is requested this is simply None (strLabel passed through)
		lsOriginalLabels = SVM.funcMakeLabels(totalAbundanceTable.funcGetMetadata(strLabel)) if strLabel else strLabel

		dictTotalMetadata = totalAbundanceTable.funcGetMetadataCopy()
		logging.debug("MicroPITA.funcRun:: Received metadata=" + str(dictTotalMetadata))
		#If there is only 1 unique value for the labels, do not run the Supervised methods
		if strLabel and ( len(set(dictTotalMetadata.get(strLabel,[]))) < 2 ):
			logging.error("The label " + strLabel + " did not have 2 or more values. Labels found=" + str(dictTotalMetadata.get(strLabel,[])))
			return False

		#Run unsupervised methods###
		#Stratify the data if need be and drop the old data
		lStratifiedAbundanceTables = totalAbundanceTable.funcStratifyByMetadata(strStratify) if strStratify else [totalAbundanceTable]

		#For each stratified abundance block or for the unstratfified abundance
		#Run the unsupervised blocks
		fAppendSupFiles = False
		for stratAbundanceTable in lStratifiedAbundanceTables:
			logging.info("MicroPITA.funcRun:: Running abundance block:"+stratAbundanceTable.funcGetName())

 			###NOT SUMMED, NOT NORMALIZED			
			#Only perform if the data is not yet normalized
			if not stratAbundanceTable.funcIsNormalized( ):
				#Need to first work with unnormalized data
				#Uses the "NoNormalize" metric lists (custom alpha / precalculated alpha metadata)
				if c_RUN_MAX_DIVERSITY_1 or c_RUN_REPRESENTIVE_DISSIMILARITY_2 or c_RUN_MAX_DISSIMILARITY_3:

					self._funcRunNormalizeSensitiveMethods(abndData=stratAbundanceTable, iSampleSelectionCount=iCount,
													 dictSelectedSamples=selectedSamples, lsAlphaMetrics=diversityMetricsAlphaNoNormalize,
													 lsBetaMetrics=diversityMetricsBetaNoNormalize,
													 lsInverseBetaMetrics=diversityMetricsBetaNoNormalize,
													 fRunDiversity=c_RUN_MAX_DIVERSITY_1,fRunRepresentative=c_RUN_REPRESENTIVE_DISSIMILARITY_2,
													 fRunExtreme=c_RUN_MAX_DISSIMILARITY_3, strAlphaMetadata=strAlphaMetadata, 
                                                                                                         istrmTree=istrmTree, istrmEnvr=istrmEnvr, fInvertDiversity=fInvertDiversity)


			#Generate selection by the rank average of user defined taxa
			#Expects (Taxa (row) by Samples (column))
			#Expects a column 0 of taxa id that is skipped
			#Returns [(sample name,average,rank)]
			#SUMMED AND NORMALIZED
			#NOTE: both calls mutate stratAbundanceTable in place for the rest of this loop body
			stratAbundanceTable.funcSumClades()
			#Normalize data at this point
			stratAbundanceTable.funcNormalize()
			if c_RUN_RANK_AVERAGE_USER_4:
				selectedSamples[ConstantsMicropita.c_strFeature] = self.funcSelectTargetedTaxaSamples(abndMatrix=stratAbundanceTable,
						lsTargetedTaxa=userDefinedTaxa, iSampleSelectionCount=iCount, sMethod=strFeatureSelection)
				logging.info("MicroPITA.funcRun:: Selected Samples Rank")
				logging.info(selectedSamples)

 			###SUMMED AND NORMALIZED analysis block
			#Diversity based metric will move reduce to terminal taxa as needed
			if c_RUN_MAX_DIVERSITY_1 or c_RUN_REPRESENTIVE_DISSIMILARITY_2 or c_RUN_MAX_DISSIMILARITY_3:

				self._funcRunNormalizeSensitiveMethods(abndData=stratAbundanceTable, iSampleSelectionCount=iCount,
												 dictSelectedSamples=selectedSamples, lsAlphaMetrics=diversityMetricsAlpha,
												 lsBetaMetrics=diversityMetricsBeta,
												 lsInverseBetaMetrics=diversityMetricsBeta,
												 fRunDiversity=c_RUN_MAX_DIVERSITY_1,fRunRepresentative=c_RUN_REPRESENTIVE_DISSIMILARITY_2,
												 fRunExtreme=c_RUN_MAX_DISSIMILARITY_3,
                                                                                                 istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr, fInvertDiversity=fInvertDiversity)

			#5::Select randomly
			#Expects sampleNames = List of sample names [name, name, name...]
			if(c_RUN_RANDOM_5):
				#Select randomly from sample names
				selectedSamples[ConstantsMicropita.c_strRandom] = self.funcGetRandomSamples(lsSamples=stratAbundanceTable.funcGetSampleNames(), iNumberOfSamplesToReturn=iCount)
				logging.info("MicroPITA.funcRun:: Selected Samples Random")
				logging.info(selectedSamples)

			#Perform supervised selection
			if c_RUN_DISTINCT or c_RUN_DISCRIMINANT:
 				if strLabel:
					dictSelectionRet = self.funcRunSupervisedDistancesFromCentroids(abundanceTable=stratAbundanceTable,
								fRunDistinct=c_RUN_DISTINCT, fRunDiscriminant=c_RUN_DISCRIMINANT,
								xOutputSupFile=ostmInputPredictFile,xPredictSupFile=ostmPredictFile,
								strSupervisedMetadata=strLabel, iSampleSupSelectionCount=iCount,
								lsOriginalSampleNames = totalAbundanceTable.funcGetSampleNames(),
								lsOriginalLabels = lsOriginalLabels,
								fAppendFiles=fAppendSupFiles)

					#Merge this stratum's supervised selections into the running totals
					#(list comprehension used for its side effect only)
					[selectedSamples.setdefault(sKey,[]).extend(lValue) for sKey,lValue in dictSelectionRet.items()]

					#After the first stratum, later strata append to the supervised files
					if not fAppendSupFiles:
						fAppendSupFiles = True
					logging.info("MicroPITA.funcRun:: Selected Samples Unsupervised")
					logging.info(selectedSamples)
		return selectedSamples
	
	#Testing: Happy path tested
	@staticmethod
	def funcWriteSelectionToFile(dictSelection,xOutputFilePath):
		"""
		Writes the selection of samples by method to an output file.
		One row per method: the method name followed by the selected sample ids.

		:param	dictSelection:	The dictionary of selections by method to be written to a file.
		:type:	Dictionary	The dictionary of selections by method {"method":["sample selected","sample selected"...]}
		:param	xOutputFilePath:	FileStream or String path to file inwhich the dictionary is written.
		:type:	String	FileStream or String path to file
		"""

		if not dictSelection:
			return

		#Open file
		#Only close the handle if we opened it here; a caller-supplied stream is theirs to close
		fOpenedHere = isinstance(xOutputFilePath, str)
		ostm = open(xOutputFilePath,"w") if fOpenedHere else xOutputFilePath
		try:
			f = csv.writer(ostm, delimiter=ConstantsMicropita.c_outputFileDelim )

			#Create output content from dictionary
			for sKey in dictSelection:
				f.writerow([sKey]+dictSelection[sKey])
				logging.debug("MicroPITA.funcWriteSelectionToFile:: Selected samples output to file:"+str(dictSelection[sKey]))
		finally:
			#Previously the handle opened from a path was never closed (resource leak)
			if fOpenedHere:
				ostm.close()
	
	#Testing: Happy Path tested
	@staticmethod
	def funcReadSelectionFileToDictionary(xInputFile):
		"""
		Reads in an output selection file from micropita and formats it into a dictionary.
		
		:param	xInputFile:	String path to file or file stream to read and translate into a dictionary.
									{"method":["sample selected","sample selected"...]}
		:type:	FileStream or String Path to file
		:return	Dictionary:	Samples selected by methods.
					Dictionary	{"Selection Method":["SampleID","SampleID","SampleID",...]}
		"""

		#Open file
		#Only close the handle if we opened it here; a caller-supplied stream is theirs to close
		fOpenedHere = isinstance(xInputFile, str)
		istm = open(xInputFile,'r') if fOpenedHere else xInputFile
		try:
			istmReader = csv.reader(istm, delimiter = ConstantsMicropita.c_outputFileDelim)

			#Dictionary to hold selection data: first column is the method, the rest are samples
			return dict([(lsLine[0], lsLine[1:]) for lsLine in istmReader])
		finally:
			#Previously the handle opened from a path was never closed (resource leak)
			if fOpenedHere:
				istm.close()

#Set up arguments reader
#Module-level parser so __doc__ below can embed the generated help text
argp = argparse.ArgumentParser( prog = "MicroPITA.py", 
	description = """Selects samples from abundance tables based on various selection schemes.""" )

#Commonly used options: sample count and selection method(s); -m may repeat
args = argp.add_argument_group( "Common", "Commonly modified options" )
args.add_argument(ConstantsMicropita.c_strCountArgument,"--num", dest="iCount", metavar = "samples", default = 10, type = int, help = ConstantsMicropita.c_strCountHelp)
args.add_argument("-m","--method", dest = "lstrMethods", metavar = "method", default = [], help = ConstantsMicropita.c_strSelectionTechniquesHelp,
	choices = ConstantsMicropita.c_lsAllMethods, action = "append")

#Custom diversity metrics: named alpha/beta metrics, precalculated alpha metadata,
#precalculated beta matrix, and phylogenetic inputs (tree/environment)
args = argp.add_argument_group( "Custom", "Selecting and inputing custom metrics" )
args.add_argument("-a","--alpha", dest = "strAlphaDiversity", metavar = "AlphaDiversity", default = None, help = ConstantsMicropita.c_strCustomAlphaDiversityHelp,  choices = Metric.setAlphaDiversities)
args.add_argument("-b","--beta", dest = "strBetaDiversity", metavar = "BetaDiversity", default = None, help = ConstantsMicropita.c_strCustomBetaDiversityHelp,  choices = list(Metric.setBetaDiversities)+[Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted])
args.add_argument("-q","--alphameta", dest = "strAlphaMetadata", metavar = "AlphaDiversityMetadata", default = None, help = ConstantsMicropita.c_strCustomAlphaDiversityMetadataHelp)
args.add_argument("-x","--betamatrix", dest = "istmBetaMatrix", metavar = "BetaDiversityMatrix", default = None, help = ConstantsMicropita.c_strCustomBetaDiversityMatrixHelp)
args.add_argument("-o","--tree", dest = "istrmTree", metavar = "PhylogeneticTree", default = None, help = ConstantsMicropita.c_strCustomPhylogeneticTreeHelp)
args.add_argument("-i","--envr", dest = "istrmEnvr", metavar = "EnvironmentFile", default = None, help = ConstantsMicropita.c_strCustomEnvironmentFileHelp)
args.add_argument("-f","--invertDiversity", dest = "fInvertDiversity", action="store_true", default = False, help = ConstantsMicropita.c_strInvertDiversityHelp)

#Table layout identifiers and targeted-feature options
args = argp.add_argument_group( "Miscellaneous", "Row/column identifiers and feature targeting options" )
args.add_argument("-d",ConstantsMicropita.c_strIDNameArgument, dest="strIDName", metavar="sample_id", help= ConstantsMicropita.c_strIDNameHelp)
args.add_argument("-l",ConstantsMicropita.c_strLastMetadataNameArgument, dest="strLastMetadataName", metavar = "metadata_id", default = None,
				  help= ConstantsMicropita.c_strLastMetadataNameHelp)
args.add_argument("-r",ConstantsMicropita.c_strTargetedFeatureMethodArgument, dest="strFeatureSelection", metavar="targeting_method", default=ConstantsMicropita.lsTargetedFeatureMethodValues[0],
				  choices=ConstantsMicropita.lsTargetedFeatureMethodValues, help= ConstantsMicropita.c_strTargetedFeatureMethodHelp)
args.add_argument("-t",ConstantsMicropita.c_strTargetedSelectionFileArgument, dest="istmFeatures", metavar="feature_file", type=argparse.FileType("rU"), help=ConstantsMicropita.c_strTargetedSelectionFileHelp)
args.add_argument("-w",ConstantsMicropita.c_strFeatureMetadataArgument, dest="strLastFeatureMetadata", metavar="Last_Feature_Metadata", default=None, help=ConstantsMicropita.c_strFeatureMetadataHelp)

#Metadata ids for supervised labels and unsupervised stratification
args = argp.add_argument_group( "Data labeling", "Metadata IDs for strata and supervised label values" )
args.add_argument("-e",ConstantsMicropita.c_strSupervisedLabelArgument, dest="strLabel", metavar= "supervised_id", help=ConstantsMicropita.c_strSupervisedLabelHelp)
args.add_argument("-s",ConstantsMicropita.c_strUnsupervisedStratifyMetadataArgument, dest="strUnsupervisedStratify", metavar="stratify_id", 
				  help= ConstantsMicropita.c_strUnsupervisedStratifyMetadataHelp)

#Delimiters for the abundance table and for clade-structured feature names
args = argp.add_argument_group( "File formatting", "Rarely modified file formatting options" )
args.add_argument("-j",ConstantsMicropita.c_strFileDelimiterArgument, dest="cFileDelimiter", metavar="column_delimiter", default="\t", help=ConstantsMicropita.c_strFileDelimiterHelp) 
args.add_argument("-k",ConstantsMicropita.c_strFeatureNameDelimiterArgument, dest="cFeatureNameDelimiter", metavar="taxonomy_delimiter", default="|", help=ConstantsMicropita.c_strFeatureNameDelimiterHelp) 

#Logging verbosity and optional intermediate output files
args = argp.add_argument_group( "Debugging", "Debugging options - modify at your own risk!" )
args.add_argument("-v",ConstantsMicropita.c_strLoggingArgument, dest="strLogLevel", metavar = "log_level", default="WARNING", 
				  choices=ConstantsMicropita.c_lsLoggingChoices, help= ConstantsMicropita.c_strLoggingHelp)
args.add_argument("-c",ConstantsMicropita.c_strCheckedAbundanceFileArgument, dest="ostmCheckedFile", metavar = "output_qc", type = argparse.FileType("w"), help = ConstantsMicropita.c_strCheckedAbundanceFileHelp)
args.add_argument("-g",ConstantsMicropita.c_strLoggingFileArgument, dest="ostmLoggingFile", metavar = "output_log", type = argparse.FileType("w"), help = ConstantsMicropita.c_strLoggingFileHelp)
args.add_argument("-u",ConstantsMicropita.c_strSupervisedInputFile, dest="ostmInputPredictFile", metavar = "output_scaled", type = argparse.FileType("w"), help = ConstantsMicropita.c_strSupervisedInputFileHelp)
args.add_argument("-p",ConstantsMicropita.c_strSupervisedPredictedFile, dest="ostmPredictFile", metavar = "output_labels", type = argparse.FileType("w"), help = ConstantsMicropita.c_strSupervisedPredictedFileHelp)

#Positional arguments: input abundance table and output selection file
argp.add_argument("istmInput", metavar = "input.pcl/biome", type = argparse.FileType("rU"), help = ConstantsMicropita.c_strAbundanceFileHelp,
	default = sys.stdin)
argp.add_argument("ostmOutput", metavar = "output.txt", type = argparse.FileType("w"), help = ConstantsMicropita.c_strGenericOutputDataFileHelp,
	default = sys.stdout)

#Prepend the generated usage/help text to the module docstring
__doc__ = "::\n\n\t" + argp.format_help( ).replace( "\n", "\n\t" ) + __doc__

def _main( ):
	"""
	Command-line entry point: parse arguments, configure logging, run the
	MicroPITA sample selection, and write the selected samples to the
	output file. Returns -1 when the analysis produces no result.
	"""
	nsArgs = argp.parse_args( )

	#Translate the textual log level to the logging module's constant and
	#direct output to the requested log file (stderr when none is given)
	iLogLevel = getattr(logging, nsArgs.strLogLevel.upper(), None)
	logStream = nsArgs.ostmLoggingFile if nsArgs.ostmLoggingFile else sys.stderr
	logging.basicConfig(stream = logStream, filemode = 'w', level = iLogLevel)

	#Run micropita
	logging.info("MicroPITA:: Start microPITA")
	microPITA = MicroPITA()

	#Argparse will append to the default but will not remove the default so I do this here
	if not nsArgs.lstrMethods:
		nsArgs.lstrMethods = [ConstantsMicropita.c_strRepresentative]

	dictSelectedSamples = microPITA.funcRun(
		strIDName		= nsArgs.strIDName,
		strLastMetadataName	= nsArgs.strLastMetadataName,
		istmInput		= nsArgs.istmInput,
		ostmInputPredictFile	= nsArgs.ostmInputPredictFile,
		ostmPredictFile		= nsArgs.ostmPredictFile,
		ostmCheckedFile		= nsArgs.ostmCheckedFile,
		ostmOutput		= nsArgs.ostmOutput,
		cDelimiter		= nsArgs.cFileDelimiter,
		cFeatureNameDelimiter	= nsArgs.cFeatureNameDelimiter,
		istmFeatures		= nsArgs.istmFeatures,
		strFeatureSelection	= nsArgs.strFeatureSelection,
		iCount			= nsArgs.iCount,
		strLastRowMetadata	= nsArgs.strLastFeatureMetadata,
		strLabel		= nsArgs.strLabel,
		strStratify		= nsArgs.strUnsupervisedStratify,
		strCustomAlpha		= nsArgs.strAlphaDiversity,
		strCustomBeta		= nsArgs.strBetaDiversity,
		strAlphaMetadata	= nsArgs.strAlphaMetadata,
		istmBetaMatrix		= nsArgs.istmBetaMatrix,
		istrmTree		= nsArgs.istrmTree,
		istrmEnvr		= nsArgs.istrmEnvr,
		lstrMethods		= nsArgs.lstrMethods,
		fInvertDiversity	= nsArgs.fInvertDiversity
	)

	#An empty/False result signals the analysis failed
	if not dictSelectedSamples:
		logging.error("MicroPITA:: Error, did not get a result from analysis.")
		return -1
	logging.info("End microPITA")

	#Log output for debugging
	logging.debug("MicroPITA:: Returned the following samples:"+str(dictSelectedSamples))

	#Write selection to file
	microPITA.funcWriteSelectionToFile(dictSelection=dictSelectedSamples, xOutputFilePath=nsArgs.ostmOutput)

if __name__ == "__main__":
	_main( )