0
|
1 #!/usr/bin/env python
|
|
2 """
|
|
3 Author: Timothy Tickle
|
|
4 Description: Class to Run analysis for the microPITA paper
|
|
5 """
|
|
6
|
|
7 #####################################################################################
|
|
8 #Copyright (C) <2012>
|
|
9 #
|
|
10 #Permission is hereby granted, free of charge, to any person obtaining a copy of
|
|
11 #this software and associated documentation files (the "Software"), to deal in the
|
|
12 #Software without restriction, including without limitation the rights to use, copy,
|
|
13 #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
|
|
14 #and to permit persons to whom the Software is furnished to do so, subject to
|
|
15 #the following conditions:
|
|
16 #
|
|
17 #The above copyright notice and this permission notice shall be included in all copies
|
|
18 #or substantial portions of the Software.
|
|
19 #
|
|
20 #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
|
|
21 #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
|
|
22 #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
23 #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
|
24 #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
|
25 #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
26 #####################################################################################
|
|
27
|
|
28 __author__ = "Timothy Tickle"
|
|
29 __copyright__ = "Copyright 2012"
|
|
30 __credits__ = ["Timothy Tickle"]
|
|
31 __license__ = "MIT"
|
|
32 __maintainer__ = "Timothy Tickle"
|
|
33 __email__ = "ttickle@sph.harvard.edu"
|
|
34 __status__ = "Development"
|
|
35
|
|
36 import sys
|
|
37 import argparse
|
|
38 from src.breadcrumbs.src.AbundanceTable import AbundanceTable
|
|
39 from src.breadcrumbs.src.ConstantsBreadCrumbs import ConstantsBreadCrumbs
|
|
40 from src.breadcrumbs.src.Metric import Metric
|
|
41 from src.breadcrumbs.src.KMedoids import Kmedoids
|
|
42 from src.breadcrumbs.src.MLPYDistanceAdaptor import MLPYDistanceAdaptor
|
|
43 from src.breadcrumbs.src.SVM import SVM
|
|
44 from src.breadcrumbs.src.UtilityMath import UtilityMath
|
|
45
|
|
46 from src.ConstantsMicropita import ConstantsMicropita
|
|
47 import csv
|
|
48 import logging
|
|
49 import math
|
|
50 import mlpy
|
|
51 import numpy as np
|
|
52 import operator
|
|
53 import os
|
|
54 import random
|
|
55 import scipy.cluster.hierarchy as hcluster
|
|
56 import scipy.spatial.distance
|
|
57 from types import *
|
|
58
|
|
59 class MicroPITA:
|
|
60 """
|
|
61 Selects samples from a first tier of a multi-tiered study to be used in a second tier.
|
|
62 Different methods can be used for selection.
|
|
63 The expected input is an abundance table (and potentially a text file of targeted features,
|
|
64 if using the targeted features option). Output is a list of samples exhibiting the
|
|
65 characteristics of interest.
|
|
66 """
|
|
67
|
|
68 #Constants
|
|
69 #Diversity metrics Alpha
|
|
70 c_strInverseSimpsonDiversity = Metric.c_strInvSimpsonDiversity
|
|
71 c_strChao1Diversity = Metric.c_strChao1Diversity
|
|
72
|
|
73 #Diversity metrics Beta
|
|
74 c_strBrayCurtisDissimilarity = Metric.c_strBrayCurtisDissimilarity
|
|
75
|
|
76 #Additive inverses of diversity metrics beta
|
|
77 c_strInvBrayCurtisDissimilarity = Metric.c_strInvBrayCurtisDissimilarity
|
|
78
|
|
79 #Technique Names
|
|
80 ConstantsMicropita.c_strDiversity2 = ConstantsMicropita.c_strDiversity+"_C"
|
|
81
|
|
82 #Targeted feature settings
|
|
83 c_strTargetedRanked = ConstantsMicropita.c_strTargetedRanked
|
|
84 c_strTargetedAbundance = ConstantsMicropita.c_strTargetedAbundance
|
|
85
|
|
86 #Technique groupings
|
|
87 # c_lsDiversityMethods = [ConstantsMicropita.c_strDiversity,ConstantsMicropita.c_strDiversity2]
|
|
88
|
|
89 #Converts ecology metrics into standardized method selection names
|
|
90 dictConvertAMetricDiversity = {c_strInverseSimpsonDiversity:ConstantsMicropita.c_strDiversity, c_strChao1Diversity:ConstantsMicropita.c_strDiversity2}
|
|
91 # dictConvertMicroPITAToAMetric = {ConstantsMicropita.c_strDiversity:c_strInverseSimpsonDiversity, ConstantsMicropita.c_strDiversity2:c_strChao1Diversity}
|
|
92 dictConvertBMetricToMethod = {c_strBrayCurtisDissimilarity:ConstantsMicropita.c_strRepresentative}
|
|
93 dictConvertInvBMetricToMethod = {c_strBrayCurtisDissimilarity:ConstantsMicropita.c_strExtreme}
|
|
94
|
|
95 #Linkage used in the Hierarchical clustering
|
|
96 c_strHierarchicalClusterMethod = 'average'
|
|
97
|
|
98 ####Group 1## Diversity
|
|
99 #Testing: Happy path Testing (8)
|
|
100 def funcGetTopRankedSamples(self, lldMatrix = None, lsSampleNames = None, iTopAmount = None):
|
|
101 """
|
|
102 Given a list of lists of measurements, for each list the indices of the highest values are returned. If lsSamplesNames is given
|
|
103 it is treated as a list of string names that is in the order of the measurements in each list. Indices are returned or the sample
|
|
104 names associated with the indices.
|
|
105
|
|
106 :param lldMatrix: List of lists [[value,value,value,value],[value,value,value,value]].
|
|
107 :type: List of lists List of measurements. Each list is a different measurement. Each measurement in positionally related to a sample.
|
|
108 :param lsSampleNames: List of sample names positionally related (the same) to each list (Optional).
|
|
109 :type: List of strings List of strings.
|
|
110 :param iTopAmount: The amount of top measured samples (assumes the higher measurements are better).
|
|
111 :type: integer Integer amount of sample names/ indices to return.
|
|
112 :return List: List of samples to be selected.
|
|
113 """
|
|
114 topRankListRet = []
|
|
115 for rowMetrics in lldMatrix:
|
|
116 #Create 2 d array to hold value and index and sort
|
|
117 liIndexX = [rowMetrics,range(len(rowMetrics))]
|
|
118 liIndexX[1].sort(key = liIndexX[0].__getitem__,reverse = True)
|
|
119
|
|
120 if lsSampleNames:
|
|
121 topRankListRet.append([lsSampleNames[iIndex] for iIndex in liIndexX[1][:iTopAmount]])
|
|
122 else:
|
|
123 topRankListRet.append(liIndexX[1][:iTopAmount])
|
|
124
|
|
125 return topRankListRet
|
|
126
|
|
127 ####Group 2## Representative Dissimilarity
|
|
128 #Testing: Happy path tested 1
|
|
129 def funcGetCentralSamplesByKMedoids(self, npaMatrix=None, sMetric=None, lsSampleNames=None, iNumberSamplesReturned=0, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None):
|
|
130 """
|
|
131 Gets centroid samples by k-medoids clustering of a given matrix.
|
|
132
|
|
133 :param npaMatrix: Numpy array where row=features and columns=samples
|
|
134 :type: Numpy array Abundance Data.
|
|
135 :param sMetric: String name of beta metric used as the distance metric.
|
|
136 :type: String String name of beta metric.
|
|
137 :param lsSampleNames: The names of the sample
|
|
138 :type: List List of strings
|
|
139 :param iNumberSamplesReturned: Number of samples to return, each will be a centroid of a sample.
|
|
140 :type: Integer Number of samples to return
|
|
141 :return List: List of selected samples.
|
|
142 :param istmBetaMatrix: File with beta-diversity matrix
|
|
143 :type: File stream or file path string
|
|
144 """
|
|
145
|
|
146 #Count of how many rows
|
|
147 sampleCount = npaMatrix.shape[0]
|
|
148 if iNumberSamplesReturned > sampleCount:
|
|
149 logging.error("MicroPITA.funcGetCentralSamplesByKMedoids:: There are not enough samples to return the amount of samples specified. Return sample count = "+str(iNumberSamplesReturned)+". Sample number = "+str(sampleCount)+".")
|
|
150 return False
|
|
151
|
|
152 #If the cluster count is equal to the sample count return all samples
|
|
153 if sampleCount == iNumberSamplesReturned:
|
|
154 return list(lsSampleNames)
|
|
155
|
|
156 #Get distance matrix
|
|
157 distanceMatrix=scipy.spatial.distance.squareform(Metric.funcReadMatrixFile(istmMatrixFile=istmBetaMatrix,lsSampleOrder=lsSampleNames)[0]) if istmBetaMatrix else Metric.funcGetBetaMetric(npadAbundancies=npaMatrix, sMetric=sMetric, istrmTree=istrmTree, istrmEnvr=istrmEnvr, lsSampleOrder=lsSampleNames)
|
|
158 if type(distanceMatrix) is BooleanType:
|
|
159 logging.error("MicroPITA.funcGetCentralSamplesByKMedoids:: Could not read in the supplied distance matrix, returning false.")
|
|
160 return False
|
|
161
|
|
162 # Handle unifrac output
|
|
163 if sMetric in [Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted]:
|
|
164 distanceMatrix = distanceMatrix[0]
|
|
165
|
|
166 #Log distance matrix
|
|
167 logging.debug("MicroPITA.funcGetCentralSamplesByKMedoids:: Distance matrix for representative selection using metric="+str(sMetric))
|
|
168
|
|
169 distance = MLPYDistanceAdaptor(npaDistanceMatrix=distanceMatrix, fIsCondensedMatrix=True)
|
|
170
|
|
171 #Create object to determine clusters/medoids
|
|
172 medoidsMaker = Kmedoids(k=iNumberSamplesReturned, dist=distance)
|
|
173 #medoidsData includes(1d numpy array, medoids indexes;
|
|
174 # 1d numpy array, non-medoids indexes;
|
|
175 # 1d numpy array, cluster membership for non-medoids;
|
|
176 # double, cost of configuration)
|
|
177 #npaMatrix is samples x rows
|
|
178 #Build a matrix of lists of indicies to pass to the distance matrix
|
|
179 lliIndicesMatrix = [[iIndexPosition] for iIndexPosition in xrange(0,len(npaMatrix))]
|
|
180 medoidsData = medoidsMaker.compute(np.array(lliIndicesMatrix))
|
|
181 logging.debug("MicroPITA.funcGetCentralSamplesByKMedoids:: Results from the kmedoid method in representative selection:")
|
|
182 logging.debug(str(medoidsData))
|
|
183
|
|
184 #If returning the same amount of clusters and samples
|
|
185 #Return centroids
|
|
186 selectedIndexes = medoidsData[0]
|
|
187 return [lsSampleNames[selectedIndexes[index]] for index in xrange(0,iNumberSamplesReturned)]
|
|
188
|
|
189 ####Group 3## Highest Dissimilarity
|
|
190 #Testing: Happy path tested
|
|
191 def funcSelectExtremeSamplesFromHClust(self, strBetaMetric, npaAbundanceMatrix, lsSampleNames, iSelectSampleCount, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None):
|
|
192 """
|
|
193 Select extreme samples from HClustering.
|
|
194
|
|
195 :param strBetaMetric: The beta metric to use for distance matrix generation.
|
|
196 :type: String The name of the beta metric to use.
|
|
197 :param npaAbundanceMatrix: Numpy array where row=samples and columns=features.
|
|
198 :type: Numpy Array Abundance data.
|
|
199 :param lsSampleNames: The names of the sample.
|
|
200 :type: List List of strings.
|
|
201 :param iSelectSampleCount: Number of samples to select (return).
|
|
202 :type: Integer Integer number of samples returned.
|
|
203 :return Samples: List of samples.
|
|
204 :param istmBetaMatrix: File with beta-diversity matrix
|
|
205 :type: File stream or file path string
|
|
206 """
|
|
207
|
|
208 #If they want all the sample count, return all sample names
|
|
209 iSampleCount=len(npaAbundanceMatrix[:,0])
|
|
210 if iSelectSampleCount==iSampleCount:
|
|
211 return lsSampleNames
|
|
212
|
|
213 #Holds the samples to be returned
|
|
214 lsReturnSamplesRet = []
|
|
215
|
|
216 #Generate beta matrix
|
|
217 #Returns condensed matrix
|
|
218 tempDistanceMatrix = scipy.spatial.distance.squareform(Metric.funcReadMatrixFile(istmMatrixFile=istmBetaMatrix,lsSampleOrder=lsSampleNames)[0]) if istmBetaMatrix else Metric.funcGetBetaMetric(npadAbundancies=npaAbundanceMatrix, sMetric=strBetaMetric, istrmTree=istrmTree, istrmEnvr=istrmEnvr, lsSampleOrder=lsSampleNames, fAdditiveInverse = True)
|
|
219
|
|
220 if strBetaMetric in [Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted]:
|
|
221 tempDistanceMatrix = tempDistanceMatrix[0]
|
|
222
|
|
223 if type(tempDistanceMatrix) is BooleanType:
|
|
224 logging.error("MicroPITA.funcSelectExtremeSamplesFromHClust:: Could not read in the supplied distance matrix, returning false.")
|
|
225 return False
|
|
226
|
|
227 if istmBetaMatrix:
|
|
228 tempDistanceMatrix = 1-tempDistanceMatrix
|
|
229
|
|
230 #Feed beta matrix to linkage to cluster
|
|
231 #Send condensed matrix
|
|
232 linkageMatrix = hcluster.linkage(tempDistanceMatrix, method=self.c_strHierarchicalClusterMethod)
|
|
233
|
|
234 #Extract cluster information from dendrogram
|
|
235 #The linakge matrix is of the form
|
|
236 #[[int1 int2 doube int3],...]
|
|
237 #int1 and int1 are the paired samples indexed at 0 and up.
|
|
238 #each list is an entry for a branch that is number starting with the first
|
|
239 #list being sample count index + 1
|
|
240 #each list is then named by an increment as they appear
|
|
241 #this means that if a number is in the list and is = sample count or greater it is not
|
|
242 #terminal and is instead a branch.
|
|
243 #This method just takes the lowest metric measurement (highest distance pairs/clusters)
|
|
244 #Works much better than the original technique
|
|
245 #get total number of samples
|
|
246
|
|
247 iCurrentSelectCount = 0
|
|
248 for row in linkageMatrix:
|
|
249 #Get nodes ofthe lowest pairing (so the furthest apart pair)
|
|
250 iNode1 = int(row[0])
|
|
251 iNode2 = int(row[1])
|
|
252 #Make sure the nodes are a terminal node (sample) and not a branch in the dendrogram
|
|
253 #The branching in the dendrogram will start at the number of samples and increment higher.
|
|
254 #Add each of the pair one at a time breaking when enough samples are selected.
|
|
255 if iNode1<iSampleCount:
|
|
256 lsReturnSamplesRet.append(lsSampleNames[iNode1])
|
|
257 iCurrentSelectCount = iCurrentSelectCount + 1
|
|
258 if iCurrentSelectCount == iSelectSampleCount:
|
|
259 break
|
|
260 if iNode2<iSampleCount:
|
|
261 lsReturnSamplesRet.append(lsSampleNames[iNode2])
|
|
262 iCurrentSelectCount = iCurrentSelectCount + 1
|
|
263 if iCurrentSelectCount == iSelectSampleCount:
|
|
264 break
|
|
265
|
|
266 #Return selected samples
|
|
267 return lsReturnSamplesRet
|
|
268
|
|
269 ####Group 4## Rank Average of user Defined Taxa
|
|
270 #Testing: Happy Path Tested
|
|
271 def funcGetAverageAbundanceSamples(self, abndTable, lsTargetedFeature, fRank=False):
|
|
272 """
|
|
273 Averages feature abundance or ranked abundance. Expects a column 0 of taxa id that is skipped.
|
|
274
|
|
275 :param abndTable: Abundance Table to analyse
|
|
276 :type: AbundanceTable Abundance Table
|
|
277 :param lsTargetedFeature: String names
|
|
278 :type: list list of string names of features (bugs) which are measured after ranking against the full sample
|
|
279 :param fRank: Indicates to rank the abundance before getting the average abundance of the features (default false)
|
|
280 :type: boolean Flag indicating ranking abundance before calculating average feature measurement (false= no ranking)
|
|
281 :return List of lists or boolean: List of lists or False on error. One internal list per sample indicating the sample,
|
|
282 feature average abundance or ranked abundance. Lists will already be sorted.
|
|
283 For not Ranked [[sample,average abundance of selected feature,1]]
|
|
284 For Ranked [[sample,average ranked abundance, average abundance of selected feature]]
|
|
285 Error Returns false
|
|
286 """
|
|
287
|
|
288 llAbundance = abndTable.funcGetAverageAbundancePerSample(lsTargetedFeature)
|
|
289 if not llAbundance:
|
|
290 logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not get average abundance, returned false. Make sure the features (bugs) are spelled correctly and in the abundance table.")
|
|
291 return False
|
|
292 #Add a space for ranking if needed
|
|
293 #Not ranked will be [[sSample,average abundance,1]]
|
|
294 #(where 1 will not discriminant ties if used in later functions, so this generalizes)
|
|
295 #Ranked will be [[sSample, average rank, average abundance]]
|
|
296 llRetAbundance = [[llist[0],-1,llist[1]] for llist in llAbundance]
|
|
297 #Rank if needed
|
|
298 if fRank:
|
|
299 abndRanked = abndTable.funcRankAbundance()
|
|
300 if abndRanked == None:
|
|
301 logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not rank the abundance table, returned false.")
|
|
302 return False
|
|
303 llRetRank = abndRanked.funcGetAverageAbundancePerSample(lsTargetedFeature)
|
|
304 if not llRetRank:
|
|
305 logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not get average ranked abundance, returned false. Make sure the features (bugs) are spelled correctly and in the abundance table.")
|
|
306 return False
|
|
307 dictRanks = dict(llRetRank)
|
|
308 llRetAbundance = [[a[0],dictRanks[a[0]],a[2]] for a in llRetAbundance]
|
|
309
|
|
310 #Sort first for ties and then for the main feature
|
|
311 if not fRank or ConstantsMicropita.c_fBreakRankTiesByDiversity:
|
|
312 llRetAbundance = sorted(llRetAbundance, key = lambda sampleData: sampleData[2], reverse = not fRank)
|
|
313 if fRank:
|
|
314 llRetAbundance = sorted(llRetAbundance, key = lambda sampleData: sampleData[1], reverse = not fRank)
|
|
315 return llRetAbundance
|
|
316
|
|
317 #Testing: Happy Path Tested
|
|
318 def funcSelectTargetedTaxaSamples(self, abndMatrix, lsTargetedTaxa, iSampleSelectionCount, sMethod = ConstantsMicropita.lsTargetedFeatureMethodValues[0]):
|
|
319 """
|
|
320 Selects samples with the highest ranks or abundance of targeted features.
|
|
321 If ranked, select the highest abundance for tie breaking
|
|
322
|
|
323 :param abndMatrix: Abundance table to analyse
|
|
324 :type: AbundanceTable Abundance table
|
|
325 :param lsTargetedTaxa: List of features
|
|
326 :type: list list of strings
|
|
327 :param iSampleSelectionCount: Number of samples to select
|
|
328 :type: integer integer
|
|
329 :param sMethod: Method to select targeted features
|
|
330 :type: string String (Can be values found in ConstantsMicropita.lsTargetedFeatureMethodValues)
|
|
331 :return List of strings: List of sample names which were selected
|
|
332 List of strings Empty list is returned on an error.
|
|
333 """
|
|
334
|
|
335 #Check data
|
|
336 if(len(lsTargetedTaxa) < 1):
|
|
337 logging.error("MicroPITA.funcSelectTargetedTaxaSamples. Taxa defined selection was requested but no features were given.")
|
|
338 return []
|
|
339
|
|
340 lsTargetedSamples = self.funcGetAverageAbundanceSamples(abndTable=abndMatrix, lsTargetedFeature=lsTargetedTaxa,
|
|
341 fRank=sMethod.lower() == self.c_strTargetedRanked.lower())
|
|
342 #If an error occured or the key word for the method was not recognized
|
|
343 if lsTargetedSamples == False:
|
|
344 logging.error("MicroPITA.funcSelectTargetedTaxaSamples:: Was not able to select for the features given. So targeted feature selection was performed. Check to make sure the features are spelled correctly and exist in the abundance file.")
|
|
345 return []
|
|
346
|
|
347 #Select from results
|
|
348 return [sSample[0] for sSample in lsTargetedSamples[:iSampleSelectionCount]]
|
|
349
|
|
350 ####Group 5## Random
|
|
351 #Testing: Happy path Tested
|
|
352 def funcGetRandomSamples(self, lsSamples=None, iNumberOfSamplesToReturn=0):
|
|
353 """
|
|
354 Returns random sample names of the number given. No replacement.
|
|
355
|
|
356 :param lsSamples: List of sample names
|
|
357 :type: list list of strings
|
|
358 :param iNumberOfSamplesToReturn: Number of samples to select
|
|
359 :type: integer integer.
|
|
360 :return List: List of selected samples (strings).
|
|
361 """
|
|
362
|
|
363 #Input matrix sample count
|
|
364 sampleCount = len(lsSamples)
|
|
365
|
|
366 #Return the full matrix if they ask for a return matrix where length == original
|
|
367 if(iNumberOfSamplesToReturn >= sampleCount):
|
|
368 return lsSamples
|
|
369
|
|
370 #Get the random indices for the sample (without replacement)
|
|
371 liRandomIndices = random.sample(range(sampleCount), iNumberOfSamplesToReturn)
|
|
372
|
|
373 #Create a boolean array of if indexes are to be included in the reduced array
|
|
374 return [sSample for iIndex, sSample in enumerate(lsSamples) if iIndex in liRandomIndices]
|
|
375
|
|
376 #Happy path tested (case 3)
|
|
377 def funcGetAveragePopulation(self, abndTable, lfCompress):
|
|
378 """
|
|
379 Get the average row per column in the abndtable.
|
|
380
|
|
381 :param abndTable: AbundanceTable of data to be averaged
|
|
382 :type: AbudanceTable
|
|
383 :param lfCompress: List of boolean flags (false means to remove sample before averaging
|
|
384 :type: List of floats
|
|
385 :return List of doubles:
|
|
386 """
|
|
387 if sum(lfCompress) == 0:
|
|
388 return []
|
|
389
|
|
390 #Get the average populations
|
|
391 lAverageRet = []
|
|
392
|
|
393 for sFeature in abndTable.funcGetAbundanceCopy():
|
|
394 sFeature = list(sFeature)[1:]
|
|
395 sFeature=np.compress(lfCompress,sFeature,axis=0)
|
|
396 lAverageRet.append(sum(sFeature)/float(len(sFeature)))
|
|
397 return lAverageRet
|
|
398
|
|
399 #Happy path tested (2 cases)
|
|
400 def funcGetDistanceFromAverage(self, abndTable,ldAverage,lsSamples,lfSelected):
|
|
401 """
|
|
402 Given an abundance table and an average sample, this returns the distance of each sample
|
|
403 (measured using brays-curtis dissimilarity) from the average.
|
|
404 The distances are reduced by needing to be in the lsSamples and being a true in the lfSelected
|
|
405 (which is associated with the samples in the order of the samples in the abundance table;
|
|
406 use abundancetable.funcGetSampleNames() to see the order if needed).
|
|
407
|
|
408 :param abndTable: Abundance table holding the data to be analyzed.
|
|
409 :type: AbundanceTable
|
|
410 :param ldAverage: Average population (Average features of the abundance table of samples)
|
|
411 :type: List of doubles which represent the average population
|
|
412 :param lsSamples: These are the only samples used in the analysis
|
|
413 :type: List of strings (sample ids)
|
|
414 :param lfSelected: Samples to be included in the analysis
|
|
415 :type: List of boolean (true means include)
|
|
416 :return: List of distances (doubles)
|
|
417 """
|
|
418 #Get the distance from label 1 of all samples in label0 splitting into selected and not selected lists
|
|
419 ldSelectedDistances = []
|
|
420
|
|
421 for sSampleName in [sSample for iindex, sSample in enumerate(lsSamples) if lfSelected[iindex]]:
|
|
422 #Get the sample measurements
|
|
423 ldSelectedDistances.append(Metric.funcGetBrayCurtisDissimilarity(np.array([abndTable.funcGetSample(sSampleName),ldAverage]))[0])
|
|
424 return ldSelectedDistances
|
|
425
|
|
426 #Happy path tested (1 case)
|
|
427 def funcMeasureDistanceFromLabelToAverageOtherLabel(self, abndTable, lfGroupOfInterest, lfGroupOther):
|
|
428 """
|
|
429 Get the distance of samples from one label from the average sample of not the label.
|
|
430 Note: This assumes 2 classes.
|
|
431
|
|
432 :param abndTable: Table of data to work out of.
|
|
433 :type: Abundace Table
|
|
434 :param lfGroupOfInterest: Boolean indicator of the sample being in the first group.
|
|
435 :type: List of floats, true indicating an individual in the group of interest.
|
|
436 :param lfGroupOther: Boolean indicator of the sample being in the other group.
|
|
437 :type: List of floats, true indicating an individual in the
|
|
438 :return List of List of doubles: [list of tuples (string sample name,double distance) for the selected population, list of tuples for the not selected population]
|
|
439 """
|
|
440 #Get all sample names
|
|
441 lsAllSamples = abndTable.funcGetSampleNames()
|
|
442
|
|
443 #Get average populations
|
|
444 lAverageOther = self.funcGetAveragePopulation(abndTable=abndTable, lfCompress=lfGroupOther)
|
|
445
|
|
446 #Get the distance from the average of the other label (label 1)
|
|
447 ldSelectedDistances = self.funcGetDistanceFromAverage(abndTable=abndTable, ldAverage=lAverageOther,
|
|
448 lsSamples=lsAllSamples, lfSelected=lfGroupOfInterest)
|
|
449
|
|
450 return zip([lsAllSamples[iindex] for iindex, fGroup in enumerate(lfGroupOfInterest) if fGroup],ldSelectedDistances)
|
|
451
|
|
452 #Happy path tested (1 test case)
|
|
453 def funcPerformDistanceSelection(self, abndTable, iSelectionCount, sLabel, sValueOfInterest):
|
|
454 """
|
|
455 Given metadata, metadata of one value (sValueOfInterest) is measured from the average (centroid) value of another label group.
|
|
456 An iSelectionCount of samples is selected from the group of interest closest to and furthest from the centroid of the other group.
|
|
457
|
|
458 :params abndTable: Abundance of measurements
|
|
459 :type: AbundanceTable
|
|
460 :params iSelectionCount: The number of samples selected per sample.
|
|
461 :type: Integer Integer greater than 0
|
|
462 :params sLabel: ID of the metadata which is the supervised label
|
|
463 :type: String
|
|
464 :params sValueOfInterest: Metadata value in the sLabel metadta row of the abundance table which defines the group of interest.
|
|
465 :type: String found in the abundance table metadata row indicated by sLabel.
|
|
466 :return list list of tuples (samplename, distance) [[iSelectionCount of tuples closest to the other centroid], [iSelectionCount of tuples farthest from the other centroid], [all tuples of samples not selected]]
|
|
467 """
|
|
468
|
|
469 lsMetadata = abndTable.funcGetMetadata(sLabel)
|
|
470 #Other metadata values
|
|
471 lsUniqueOtherValues = list(set(lsMetadata)-set(sValueOfInterest))
|
|
472
|
|
473 #Get boolean indicator of values of interest
|
|
474 lfLabelsInterested = [sValueOfInterest == sValue for sValue in lsMetadata]
|
|
475
|
|
476 #Get the distances of the items of interest from the other metadata values
|
|
477 dictDistanceAverages = {}
|
|
478 for sOtherLabel in lsUniqueOtherValues:
|
|
479 #Get boolean indicator of labels not of interest
|
|
480 lfLabelsOther = [sOtherLabel == sValue for sValue in lsMetadata]
|
|
481
|
|
482 #Get the distances of data from two different groups to the average of the other
|
|
483 ldValueDistances = dict(self.funcMeasureDistanceFromLabelToAverageOtherLabel(abndTable, lfLabelsInterested, lfLabelsOther))
|
|
484
|
|
485 for sKey in ldValueDistances:
|
|
486 dictDistanceAverages[sKey] = ldValueDistances[sKey] + dictDistanceAverages[sKey] if sKey in dictDistanceAverages else ldValueDistances[sKey]
|
|
487
|
|
488 #Finish average by dividing by length of lsUniqueOtherValues
|
|
489 ltpleAverageDistances = [(sKey, dictDistanceAverages[sKey]/float(len(lsUniqueOtherValues))) for sKey in dictDistanceAverages]
|
|
490
|
|
491 #Sort to extract extremes
|
|
492 ltpleAverageDistances = sorted(ltpleAverageDistances,key=operator.itemgetter(1))
|
|
493
|
|
494 #Get the closest and farthest distances
|
|
495 ltupleDiscriminantSamples = ltpleAverageDistances[:iSelectionCount]
|
|
496 ltupleDistinctSamples = ltpleAverageDistances[iSelectionCount*-1:]
|
|
497
|
|
498 #Remove the selected samples from the larger population of distances (better visualization)
|
|
499 ldSelected = [tpleSelected[0] for tpleSelected in ltupleDiscriminantSamples+ltupleDistinctSamples]
|
|
500
|
|
501 #Return discriminant tuples, distinct tuples, other tuples
|
|
502 return [ltupleDiscriminantSamples, ltupleDistinctSamples,
|
|
503 [tplData for tplData in ltpleAverageDistances if tplData[0] not in ldSelected]]
|
|
504
|
|
505 #Run the supervised method surrounding distance from centroids
|
|
506 #Happy path tested (3 test cases)
|
|
507 def funcRunSupervisedDistancesFromCentroids(self, abundanceTable, fRunDistinct, fRunDiscriminant,
|
|
508 xOutputSupFile, xPredictSupFile, strSupervisedMetadata,
|
|
509 iSampleSupSelectionCount, lsOriginalSampleNames, lsOriginalLabels, fAppendFiles = False):
|
|
510 """
|
|
511 Runs supervised methods based on measuring distances of one label from the centroid of another. NAs are evaluated as theirown group.
|
|
512
|
|
513 :param abundanceTable: AbundanceTable
|
|
514 :type: AbudanceTable Data to analyze
|
|
515 :param fRunDistinct: Run distinct selection method
|
|
516 :type: Boolean boolean (true runs method)
|
|
517 :param fRunDiscriminant: Run discriminant method
|
|
518 :type: Boolean boolean (true runs method)
|
|
519 :param xOutputSupFile: File output from supervised methods detailing data going into the method.
|
|
520 :type: String or FileStream
|
|
521 :param xPredictSupFile: File output from supervised methods distance results from supervised methods.
|
|
522 :type: String or FileStream
|
|
523 :param strSupervisedMetadata: The metadata that will be used to group samples.
|
|
524 :type: String
|
|
525 :param iSampleSupSelectionCount: Number of samples to select
|
|
526 :type: Integer int sample selection count
|
|
527 :param lsOriginalSampleNames: List of the sample names, order is important and should be preserved from the abundanceTable.
|
|
528 :type: List of samples
|
|
529 :param fAppendFiles: Indicates that output files already exist and appending is occuring.
|
|
530 :type: Boolean
|
|
531 :return Selected Samples: A dictionary of selected samples by selection ID
|
|
532 Dictionary {"Selection Method":["SampleID","SampleID"...]}
|
|
533 """
|
|
534 #Get labels and run one label against many
|
|
535 lstrMetadata = abundanceTable.funcGetMetadata(strSupervisedMetadata)
|
|
536 dictlltpleDistanceMeasurements = {}
|
|
537 for sMetadataValue in set(lstrMetadata):
|
|
538
|
|
539 #For now perform the selection here for the label of interest against the other labels
|
|
540 dictlltpleDistanceMeasurements.setdefault(sMetadataValue,[]).extend(self.funcPerformDistanceSelection(abndTable=abundanceTable,
|
|
541 iSelectionCount=iSampleSupSelectionCount, sLabel=strSupervisedMetadata, sValueOfInterest=sMetadataValue))
|
|
542
|
|
543 #Make expected output files for supervised methods
|
|
544 #1. Output file which is similar to an input file for SVMs
|
|
545 #2. Output file that is similar to the probabilitic output of a SVM (LibSVM)
|
|
546 #Manly for making output of supervised methods (Distance from Centroid) similar
|
|
547 #MicropitaVis needs some of these files
|
|
548 if xOutputSupFile:
|
|
549 if fAppendFiles:
|
|
550 SVM.funcUpdateSVMFileWithAbundanceTable(abndAbundanceTable=abundanceTable, xOutputSVMFile=xOutputSupFile,
|
|
551 lsOriginalLabels=lsOriginalLabels, lsSampleOrdering=lsOriginalSampleNames)
|
|
552 else:
|
|
553 SVM.funcConvertAbundanceTableToSVMFile(abndAbundanceTable=abundanceTable, xOutputSVMFile=xOutputSupFile,
|
|
554 sMetadataLabel=strSupervisedMetadata, lsOriginalLabels=lsOriginalLabels, lsSampleOrdering=lsOriginalSampleNames)
|
|
555
|
|
556 #Will contain the samples selected to return
|
|
557 #One or more of the methods may be active so this is why I am extending instead of just returning the result of each method type
|
|
558 dictSelectedSamplesRet = dict()
|
|
559 for sKey, ltplDistances in dictlltpleDistanceMeasurements.items():
|
|
560 if fRunDistinct:
|
|
561 dictSelectedSamplesRet.setdefault(ConstantsMicropita.c_strDistinct,[]).extend([ltple[0] for ltple in ltplDistances[1]])
|
|
562 if fRunDiscriminant:
|
|
563 dictSelectedSamplesRet.setdefault(ConstantsMicropita.c_strDiscriminant,[]).extend([ltple[0] for ltple in ltplDistances[0]])
|
|
564
|
|
565 if xPredictSupFile:
|
|
566 dictFlattenedDistances = dict()
|
|
567 [dictFlattenedDistances.setdefault(sKey, []).append(tple)
|
|
568 for sKey, lltple in dictlltpleDistanceMeasurements.items()
|
|
569 for ltple in lltple for tple in ltple]
|
|
570 if fAppendFiles:
|
|
571 self._updatePredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xOutputSupFile,
|
|
572 dictltpleDistanceMeasurements=dictFlattenedDistances, abundanceTable=abundanceTable, lsOriginalSampleNames=lsOriginalSampleNames)
|
|
573 else:
|
|
574 self._writeToPredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xOutputSupFile,
|
|
575 dictltpleDistanceMeasurements=dictFlattenedDistances, abundanceTable=abundanceTable, lsOriginalSampleNames=lsOriginalSampleNames)
|
|
576 return dictSelectedSamplesRet
|
|
577
|
|
578 #Two happy path test cases
|
|
579 def _updatePredictFile(self, xPredictSupFile, xInputLabelsFile, dictltpleDistanceMeasurements, abundanceTable, lsOriginalSampleNames):
|
|
580 """
|
|
581 Manages updating the predict file.
|
|
582
|
|
583 :param xPredictSupFile: File that has predictions (distances) from the supervised method.
|
|
584 :type: FileStream or String file path
|
|
585 :param xInputLabelsFile: File that as input to the supervised methods.
|
|
586 :type: FileStream or String file path
|
|
587 :param dictltpleDistanceMeasurements:
|
|
588 :type: Dictionary of lists of tuples {"labelgroup":[("SampleName",dDistance)...], "labelgroup":[("SampleName",dDistance)...]}
|
|
589 """
|
|
590
|
|
591 if not isinstance(xPredictSupFile, str):
|
|
592 xPredictSupFile.close()
|
|
593 xPredictSupFile = xPredictSupFile.name
|
|
594 csvr = open(xPredictSupFile,'r')
|
|
595
|
|
596 f = csv.reader(csvr,delimiter=ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace)
|
|
597 lsHeader = f.next()[1:]
|
|
598 dictlltpleRead = dict([(sHeader,[]) for sHeader in lsHeader])
|
|
599
|
|
600 #Read data in
|
|
601 iSampleIndex = 0
|
|
602 for sRow in f:
|
|
603 sLabel = sRow[0]
|
|
604 [dictlltpleRead[lsHeader[iDistanceIndex]].append((lsOriginalSampleNames[iSampleIndex],dDistance)) for iDistanceIndex, dDistance in enumerate(sRow[1:])
|
|
605 if not dDistance == ConstantsMicropita.c_sEmptyPredictFileValue]
|
|
606 iSampleIndex += 1
|
|
607
|
|
608 #Combine dictltpleDistanceMeasurements with new data
|
|
609 #If they share a key then merge keeping parameter data
|
|
610 #If they do not share the key, keep the full data
|
|
611 dictNew = {}
|
|
612 for sKey in dictltpleDistanceMeasurements.keys():
|
|
613 lsSamples = [tple[0] for tple in dictltpleDistanceMeasurements[sKey]]
|
|
614 dictNew[sKey] = dictltpleDistanceMeasurements[sKey]+[tple for tple in dictlltpleRead[sKey] if tple[0] not in lsSamples] if sKey in dictlltpleRead.keys() else dictltpleDistanceMeasurements[sKey]
|
|
615 for sKey in dictlltpleRead:
|
|
616 if sKey not in dictltpleDistanceMeasurements.keys():
|
|
617 dictNew[sKey] = dictlltpleRead[sKey]
|
|
618
|
|
619 #Call writer
|
|
620 self._writeToPredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xInputLabelsFile,
|
|
621 dictltpleDistanceMeasurements=dictNew, abundanceTable=abundanceTable,
|
|
622 lsOriginalSampleNames=lsOriginalSampleNames, fFromUpdate=True)
|
|
623
|
|
624 #2 happy path test cases
|
|
625 def _writeToPredictFile(self, xPredictSupFile, xInputLabelsFile, dictltpleDistanceMeasurements, abundanceTable, lsOriginalSampleNames, fFromUpdate=False):
|
|
626 """
|
|
627 Write to the predict file.
|
|
628
|
|
629 :param xPredictSupFile: File that has predictions (distances) from the supervised method.
|
|
630 :type: FileStream or String file path
|
|
631 :param xInputLabelsFile: File that as input to the supervised methods.
|
|
632 :type: FileStream or String file path
|
|
633 :param dictltpleDistanceMeasurements:
|
|
634 :type: Dictionary of lists of tuples {"labelgroup":[("SampleName",dDistance)...], "labelgroup":[("SampleName",dDistance)...]}
|
|
635 :param abundanceTable: An abundance table of the sample data.
|
|
636 :type: AbundanceTable
|
|
637 :param lsOriginalSampleNames: Used if the file is being updated as the sample names so that it may be passed in and consistent with other writing.
|
|
638 Otherwise will use the sample names from the abundance table.
|
|
639 :type: List of strings
|
|
640 :param fFromUpdate: Indicates if this is part of an update to the file or not.
|
|
641 :type: Boolean
|
|
642 """
|
|
643
|
|
644 xInputLabelsFileName = xInputLabelsFile
|
|
645 if not isinstance(xInputLabelsFile,str):
|
|
646 xInputLabelsFileName = xInputLabelsFile.name
|
|
647 f = csv.writer(open(xPredictSupFile,"w") if isinstance(xPredictSupFile, str) else xPredictSupFile,delimiter=ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace)
|
|
648
|
|
649 lsAllSampleNames = abundanceTable.funcGetSampleNames()
|
|
650 lsLabels = SVM.funcReadLabelsFromFile(xSVMFile=xInputLabelsFileName, lsAllSampleNames= lsOriginalSampleNames if fFromUpdate else lsAllSampleNames,
|
|
651 isPredictFile=False)
|
|
652 dictLabels = dict([(sSample,sLabel) for sLabel in lsLabels.keys() for sSample in lsLabels[sLabel]])
|
|
653
|
|
654 #Dictionay keys will be used to order the predict file
|
|
655 lsMeasurementKeys = dictltpleDistanceMeasurements.keys()
|
|
656 #Make header
|
|
657 f.writerow(["labels"]+lsMeasurementKeys)
|
|
658
|
|
659 #Reformat dictionary to make it easier to use
|
|
660 for sKey in dictltpleDistanceMeasurements:
|
|
661 dictltpleDistanceMeasurements[sKey] = dict([ltpl for ltpl in dictltpleDistanceMeasurements[sKey]])
|
|
662
|
|
663 for sSample in lsOriginalSampleNames:
|
|
664 #Make body of file
|
|
665 f.writerow([dictLabels.get(sSample,ConstantsMicropita.c_sEmptyPredictFileValue)]+
|
|
666 [str(dictltpleDistanceMeasurements[sKey].get(sSample,ConstantsMicropita.c_sEmptyPredictFileValue))
|
|
667 for sKey in lsMeasurementKeys])
|
|
668
|
|
669 def _funcRunNormalizeSensitiveMethods(self, abndData, iSampleSelectionCount, dictSelectedSamples, lsAlphaMetrics, lsBetaMetrics, lsInverseBetaMetrics,
|
|
670 fRunDiversity, fRunRepresentative, fRunExtreme, strAlphaMetadata=None,
|
|
671 istmBetaMatrix=None, istrmTree=None, istrmEnvr=None, fInvertDiversity=False):
|
|
672 """
|
|
673 Manages running methods that are sensitive to normalization. This is called twice, once for the set of methods which should not be normalized and the other
|
|
674 for the set that should be normalized.
|
|
675
|
|
676 :param abndData: Abundance table object holding the samples to be measured.
|
|
677 :type: AbundanceTable
|
|
678 :param iSampleSelectionCount The number of samples to select per method.
|
|
679 :type: Integer
|
|
680 :param dictSelectedSamples Will be added to as samples are selected {"Method:["strSelectedSampleID","strSelectedSampleID"...]}.
|
|
681 :type: Dictionary
|
|
682 :param lsAlphaMetrics: List of alpha metrics to use on alpha metric dependent assays (like highest diversity).
|
|
683 :type: List of strings
|
|
684 :param lsBetaMetrics: List of beta metrics to use on beta metric dependent assays (like most representative).
|
|
685 :type: List of strings
|
|
686 :param lsInverseBetaMetrics: List of inverse beta metrics to use on inverse beta metric dependent assays (like most dissimilar).
|
|
687 :type: List of strings
|
|
688 :param fRunDiversity: Run Diversity based methods (true indicates run).
|
|
689 :type: Boolean
|
|
690 :param fRunRepresentative: Run Representative based methods (true indicates run).
|
|
691 :type: Boolean
|
|
692 :param fRunExtreme: Run Extreme based methods (true indicates run).
|
|
693 :type: Boolean
|
|
694 :param istmBetaMatrix: File that has a precalculated beta matrix
|
|
695 :type: File stream or File path string
|
|
696 :return Selected Samples: Samples selected by methods.
|
|
697 Dictionary {"Selection Method":["SampleID","SampleID","SampleID",...]}
|
|
698 """
|
|
699
|
|
700 #Sample ids/names
|
|
701 lsSampleNames = abndData.funcGetSampleNames()
|
|
702
|
|
703 #Generate alpha metrics and get most diverse
|
|
704 if fRunDiversity:
|
|
705
|
|
706 #Get Alpha metrics matrix
|
|
707 internalAlphaMatrix = None
|
|
708 #Name of technique
|
|
709 strMethod = [strAlphaMetadata] if strAlphaMetadata else lsAlphaMetrics
|
|
710
|
|
711 #If given an alpha-diversity metadata
|
|
712 if strAlphaMetadata:
|
|
713 internalAlphaMatrix = [[float(strNum) for strNum in abndData.funcGetMetadata(strAlphaMetadata)]]
|
|
714 else:
|
|
715 #Expects Observations (Taxa (row) x sample (column))
|
|
716 #Returns [[metric1-sample1, metric1-sample2, metric1-sample3],[metric1-sample1, metric1-sample2, metric1-sample3]]
|
|
717 internalAlphaMatrix = Metric.funcBuildAlphaMetricsMatrix(npaSampleAbundance = abndData.funcGetAbundanceCopy()
|
|
718 if not abndData.funcIsSummed()
|
|
719 else abndData.funcGetFeatureAbundanceTable(abndData.funcGetTerminalNodes()).funcGetAbundanceCopy(),
|
|
720 lsSampleNames = lsSampleNames, lsDiversityMetricAlpha = lsAlphaMetrics)
|
|
721
|
|
722 if internalAlphaMatrix:
|
|
723 #Invert measurments
|
|
724 if fInvertDiversity:
|
|
725 lldNewDiversity = []
|
|
726 for lsLine in internalAlphaMatrix:
|
|
727 lldNewDiversity.append([1/max(dValue,ConstantsMicropita.c_smallNumber) for dValue in lsLine])
|
|
728 internalAlphaMatrix = lldNewDiversity
|
|
729 #Get top ranked alpha diversity by most diverse
|
|
730 #Expects [[sample1,sample2,sample3...],[sample1,sample2,sample3..],...]
|
|
731 #Returns [[sampleName1, sampleName2, sampleNameN],[sampleName1, sampleName2, sampleNameN]]
|
|
732 mostDiverseAlphaSamplesIndexes = self.funcGetTopRankedSamples(lldMatrix=internalAlphaMatrix, lsSampleNames=lsSampleNames, iTopAmount=iSampleSelectionCount)
|
|
733
|
|
734 #Add to results
|
|
735 for index in xrange(0,len(strMethod)):
|
|
736 strSelectionMethod = self.dictConvertAMetricDiversity.get(strMethod[index],ConstantsMicropita.c_strDiversity+"="+strMethod[index])
|
|
737 dictSelectedSamples.setdefault(strSelectionMethod,[]).extend(mostDiverseAlphaSamplesIndexes[index])
|
|
738
|
|
739 logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Selected Samples 1b")
|
|
740 logging.info(dictSelectedSamples)
|
|
741
|
|
742 #Generate beta metrics and
|
|
743 if fRunRepresentative or fRunExtreme:
|
|
744
|
|
745 #Abundance matrix transposed
|
|
746 npaTransposedAbundance = UtilityMath.funcTransposeDataMatrix(abndData.funcGetAbundanceCopy(), fRemoveAdornments=True)
|
|
747
|
|
748 #Get center selection using clusters/tiling
|
|
749 #This will be for beta metrics in normalized space
|
|
750 if fRunRepresentative:
|
|
751
|
|
752 if istmBetaMatrix:
|
|
753 #Get representative dissimilarity samples
|
|
754 medoidSamples=self.funcGetCentralSamplesByKMedoids(npaMatrix=npaTransposedAbundance, sMetric=ConstantsMicropita.c_custom, lsSampleNames=lsSampleNames, iNumberSamplesReturned=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr)
|
|
755
|
|
756 if medoidSamples:
|
|
757 dictSelectedSamples.setdefault(ConstantsMicropita.c_strRepresentative+"="+ConstantsMicropita.c_custom,[]).extend(medoidSamples)
|
|
758 else:
|
|
759 logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Performing representative selection on normalized data.")
|
|
760 for bMetric in lsBetaMetrics:
|
|
761
|
|
762 #Get representative dissimilarity samples
|
|
763 medoidSamples=self.funcGetCentralSamplesByKMedoids(npaMatrix=npaTransposedAbundance, sMetric=bMetric, lsSampleNames=lsSampleNames, iNumberSamplesReturned=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr)
|
|
764
|
|
765 if medoidSamples:
|
|
766 dictSelectedSamples.setdefault(self.dictConvertBMetricToMethod.get(bMetric,ConstantsMicropita.c_strRepresentative+"="+bMetric),[]).extend(medoidSamples)
|
|
767
|
|
768 #Get extreme selection using clusters, tiling
|
|
769 if fRunExtreme:
|
|
770 logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Performing extreme selection on normalized data.")
|
|
771 if istmBetaMatrix:
|
|
772
|
|
773 #Samples for representative dissimilarity
|
|
774 #This involves inverting the distance metric,
|
|
775 #Taking the dendrogram level of where the number cluster == the number of samples to select
|
|
776 #Returning a repersentative sample from each cluster
|
|
777 extremeSamples = self.funcSelectExtremeSamplesFromHClust(strBetaMetric=ConstantsMicropita.c_custom, npaAbundanceMatrix=npaTransposedAbundance, lsSampleNames=lsSampleNames, iSelectSampleCount=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr)
|
|
778
|
|
779 #Add selected samples
|
|
780 if extremeSamples:
|
|
781 dictSelectedSamples.setdefault(ConstantsMicropita.c_strExtreme+"="+ConstantsMicropita.c_custom,[]).extend(extremeSamples)
|
|
782
|
|
783 else:
|
|
784 #Run KMedoids with inverse custom distance metric in normalized space
|
|
785 for bMetric in lsInverseBetaMetrics:
|
|
786
|
|
787 #Samples for representative dissimilarity
|
|
788 #This involves inverting the distance metric,
|
|
789 #Taking the dendrogram level of where the number cluster == the number of samples to select
|
|
790 #Returning a repersentative sample from each cluster
|
|
791 extremeSamples = self.funcSelectExtremeSamplesFromHClust(strBetaMetric=bMetric, npaAbundanceMatrix=npaTransposedAbundance, lsSampleNames=lsSampleNames, iSelectSampleCount=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr)
|
|
792
|
|
793 #Add selected samples
|
|
794 if extremeSamples:
|
|
795 dictSelectedSamples.setdefault(self.dictConvertInvBMetricToMethod.get(bMetric,ConstantsMicropita.c_strExtreme+"="+bMetric),[]).extend(extremeSamples)
|
|
796
|
|
797 logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Selected Samples 2,3b")
|
|
798 logging.info(dictSelectedSamples)
|
|
799 return dictSelectedSamples
|
|
800
|
|
801 def funcRun(self, strIDName, strLastMetadataName, istmInput,
|
|
802 ostmInputPredictFile, ostmPredictFile, ostmCheckedFile, ostmOutput,
|
|
803 cDelimiter, cFeatureNameDelimiter, strFeatureSelection,
|
|
804 istmFeatures, iCount, lstrMethods, strLastRowMetadata = None, strLabel = None, strStratify = None,
|
|
805 strCustomAlpha = None, strCustomBeta = None, strAlphaMetadata = None, istmBetaMatrix = None, istrmTree = None, istrmEnvr = None,
|
|
806 iMinSeqs = ConstantsMicropita.c_liOccurenceFilter[0], iMinSamples = ConstantsMicropita.c_liOccurenceFilter[1], fInvertDiversity = False):
|
|
807 """
|
|
808 Manages the selection of samples given different metrics.
|
|
809
|
|
810 :param strIDName: Sample Id metadata row
|
|
811 :type: String
|
|
812 :param strLastMetadataName: The id of the metadata positioned last in the abundance table.
|
|
813 :type: String String metadata id.
|
|
814 :param istmInput: File to store input data to supervised methods.
|
|
815 :type: FileStream of String file path
|
|
816 :param ostmInputPredictFile: File to store distances from supervised methods.
|
|
817 :type: FileStream or String file path
|
|
818 :param ostmCheckedFile: File to store the AbundanceTable data after it is being checked.
|
|
819 :type: FileStream or String file path
|
|
820 :param ostmOutPut: File to store sample selection by methods of interest.
|
|
821 :type: FileStream or String file path
|
|
822 :param cDelimiter: Delimiter of abundance table.
|
|
823 :type: Character Char (default TAB).
|
|
824 :param cFeatureNameDelimiter: Delimiter of the name of features (for instance if they contain consensus lineages indicating clades).
|
|
825 :type: Character (default |).
|
|
826 :param stFeatureSelectionMethod: Which method to use to select features in a targeted manner (Using average ranked abundance or average abundance).
|
|
827 :type: String (specific values indicated in ConstantsMicropita.lsTargetedFeatureMethodValues).
|
|
828 :param istmFeatures: File which holds the features of interest if using targeted feature methodology.
|
|
829 :type: FileStream or String file path
|
|
830 :param iCount: Number of samples to select in each methods, supervised methods select this amount per label if possible.
|
|
831 :type: Integer integer.
|
|
832 :param lstrMethods: List of strings indicating selection techniques.
|
|
833 :type: List of string method names
|
|
834 :param strLabel: The metadata used for supervised labels.
|
|
835 :type: String
|
|
836 :param strStratify: The metadata used to stratify unsupervised data.
|
|
837 :type: String
|
|
838 :param strCustomAlpha: Custom alpha diversity metric
|
|
839 :type: String
|
|
840 :param strCustomBeta: Custom beta diversity metric
|
|
841 :type: String
|
|
842 :param strAlphaMetadata: Metadata id which is a diveristy metric to use in highest diversity sampling
|
|
843 :type: String
|
|
844 :param istmBetaMatrix: File containing precalculated beta-diversity matrix for representative sampling
|
|
845 :type: FileStream or String file path
|
|
846 :param istrmTree: File containing tree for phylogentic beta-diversity analysis
|
|
847 :type: FileStream or String file path
|
|
848 :param istrmEnvr: File containing environment for phylogentic beta-diversity analysis
|
|
849 :type: FileStream or String file path
|
|
850 :param iMinSeqs: Minimum sequence in the occurence filter which filters all features not with a minimum number of sequences in each of a minimum number of samples.
|
|
851 :type: Integer
|
|
852 :param iMinSamples: Minimum sample count for the occurence filter.
|
|
853 :type: Integer
|
|
854 :param fInvertDiversity: When true will invert diversity measurements before using.
|
|
855 :type: boolean
|
|
856 :return Selected Samples: Samples selected by methods.
|
|
857 Dictionary {"Selection Method":["SampleID","SampleID","SampleID",...]}
|
|
858 """
|
|
859
|
|
860 #Holds the top ranked samples from different metrics
|
|
861 #dict[metric name] = [samplename,samplename...]
|
|
862 selectedSamples = dict()
|
|
863
|
|
864 #If a target feature file is given make sure that targeted feature is in the selection methods, if not add
|
|
865 if ConstantsMicropita.c_strFeature in lstrMethods:
|
|
866 if not istmFeatures:
|
|
867 logging.error("MicroPITA.funcRun:: Did not receive both the Targeted feature file and the feature selection method. MicroPITA did not run.")
|
|
868 return False
|
|
869
|
|
870 #Diversity metrics to run
|
|
871 #Use custom metrics if specified
|
|
872 #Custom beta metrics set to normalized only, custom alpha metrics set to count only
|
|
873 diversityMetricsAlpha = [] if strCustomAlpha or strAlphaMetadata else [MicroPITA.c_strInverseSimpsonDiversity]
|
|
874 diversityMetricsBeta = [] if istmBetaMatrix else [strCustomBeta] if strCustomBeta else [MicroPITA.c_strBrayCurtisDissimilarity]
|
|
875 # inverseDiversityMetricsBeta = [MicroPITA.c_strInvBrayCurtisDissimilarity]
|
|
876 diversityMetricsAlphaNoNormalize = [strAlphaMetadata] if strAlphaMetadata else [strCustomAlpha] if strCustomAlpha else []
|
|
877 diversityMetricsBetaNoNormalize = []
|
|
878 # inverseDiversityMetricsBetaNoNormalize = []
|
|
879
|
|
880 #Targeted taxa
|
|
881 userDefinedTaxa = []
|
|
882
|
|
883 #Perform different flows flags
|
|
884 c_RUN_MAX_DIVERSITY_1 = ConstantsMicropita.c_strDiversity in lstrMethods
|
|
885 c_RUN_REPRESENTIVE_DISSIMILARITY_2 = ConstantsMicropita.c_strRepresentative in lstrMethods
|
|
886 c_RUN_MAX_DISSIMILARITY_3 = ConstantsMicropita.c_strExtreme in lstrMethods
|
|
887 c_RUN_RANK_AVERAGE_USER_4 = False
|
|
888 if ConstantsMicropita.c_strFeature in lstrMethods:
|
|
889 c_RUN_RANK_AVERAGE_USER_4 = True
|
|
890 if not istmFeatures:
|
|
891 logging.error("MicroPITA.funcRun:: No taxa file was given for taxa selection.")
|
|
892 return False
|
|
893 #Read in taxa list, break down to lines and filter out empty strings
|
|
894 userDefinedTaxa = filter(None,(s.strip( ) for s in istmFeatures.readlines()))
|
|
895 c_RUN_RANDOM_5 = ConstantsMicropita.c_strRandom in lstrMethods
|
|
896 c_RUN_DISTINCT = ConstantsMicropita.c_strDistinct in lstrMethods
|
|
897 c_RUN_DISCRIMINANT = ConstantsMicropita.c_strDiscriminant in lstrMethods
|
|
898
|
|
899 #Read in abundance data
|
|
900 #Abundance is a structured array. Samples (column) by Taxa (rows) with the taxa id row included as the column index=0
|
|
901 #Abundance table object to read in and manage data
|
|
902 totalAbundanceTable = AbundanceTable.funcMakeFromFile(xInputFile=istmInput, lOccurenceFilter = [iMinSeqs, iMinSamples],
|
|
903 cDelimiter=cDelimiter, sMetadataID=strIDName, sLastMetadataRow=strLastRowMetadata,
|
|
904 sLastMetadata=strLastMetadataName, cFeatureNameDelimiter=cFeatureNameDelimiter, xOutputFile=ostmCheckedFile)
|
|
905 if not totalAbundanceTable:
|
|
906 logging.error("MicroPITA.funcRun:: Could not read in the abundance table. Analysis was not performed."+
|
|
907 " This often occurs when the Last Metadata is not specified correctly."+
|
|
908 " Please check to make sure the Last Metadata selection is the row of the last metadata,"+
|
|
909 " all values after this selection should be microbial measurements and should be numeric.")
|
|
910 return False
|
|
911
|
|
912 lsOriginalLabels = SVM.funcMakeLabels(totalAbundanceTable.funcGetMetadata(strLabel)) if strLabel else strLabel
|
|
913
|
|
914 dictTotalMetadata = totalAbundanceTable.funcGetMetadataCopy()
|
|
915 logging.debug("MicroPITA.funcRun:: Received metadata=" + str(dictTotalMetadata))
|
|
916 #If there is only 1 unique value for the labels, do not run the Supervised methods
|
|
917 if strLabel and ( len(set(dictTotalMetadata.get(strLabel,[]))) < 2 ):
|
|
918 logging.error("The label " + strLabel + " did not have 2 or more values. Labels found=" + str(dictTotalMetadata.get(strLabel,[])))
|
|
919 return False
|
|
920
|
|
921 #Run unsupervised methods###
|
|
922 #Stratify the data if need be and drop the old data
|
|
923 lStratifiedAbundanceTables = totalAbundanceTable.funcStratifyByMetadata(strStratify) if strStratify else [totalAbundanceTable]
|
|
924
|
|
925 #For each stratified abundance block or for the unstratfified abundance
|
|
926 #Run the unsupervised blocks
|
|
927 fAppendSupFiles = False
|
|
928 for stratAbundanceTable in lStratifiedAbundanceTables:
|
|
929 logging.info("MicroPITA.funcRun:: Running abundance block:"+stratAbundanceTable.funcGetName())
|
|
930
|
|
931 ###NOT SUMMED, NOT NORMALIZED
|
|
932 #Only perform if the data is not yet normalized
|
|
933 if not stratAbundanceTable.funcIsNormalized( ):
|
|
934 #Need to first work with unnormalized data
|
|
935 if c_RUN_MAX_DIVERSITY_1 or c_RUN_REPRESENTIVE_DISSIMILARITY_2 or c_RUN_MAX_DISSIMILARITY_3:
|
|
936
|
|
937 self._funcRunNormalizeSensitiveMethods(abndData=stratAbundanceTable, iSampleSelectionCount=iCount,
|
|
938 dictSelectedSamples=selectedSamples, lsAlphaMetrics=diversityMetricsAlphaNoNormalize,
|
|
939 lsBetaMetrics=diversityMetricsBetaNoNormalize,
|
|
940 lsInverseBetaMetrics=diversityMetricsBetaNoNormalize,
|
|
941 fRunDiversity=c_RUN_MAX_DIVERSITY_1,fRunRepresentative=c_RUN_REPRESENTIVE_DISSIMILARITY_2,
|
|
942 fRunExtreme=c_RUN_MAX_DISSIMILARITY_3, strAlphaMetadata=strAlphaMetadata,
|
|
943 istrmTree=istrmTree, istrmEnvr=istrmEnvr, fInvertDiversity=fInvertDiversity)
|
|
944
|
|
945
|
|
946 #Generate selection by the rank average of user defined taxa
|
|
947 #Expects (Taxa (row) by Samples (column))
|
|
948 #Expects a column 0 of taxa id that is skipped
|
|
949 #Returns [(sample name,average,rank)]
|
|
950 #SUMMED AND NORMALIZED
|
|
951 stratAbundanceTable.funcSumClades()
|
|
952 #Normalize data at this point
|
|
953 stratAbundanceTable.funcNormalize()
|
|
954 if c_RUN_RANK_AVERAGE_USER_4:
|
|
955 selectedSamples[ConstantsMicropita.c_strFeature] = self.funcSelectTargetedTaxaSamples(abndMatrix=stratAbundanceTable,
|
|
956 lsTargetedTaxa=userDefinedTaxa, iSampleSelectionCount=iCount, sMethod=strFeatureSelection)
|
|
957 logging.info("MicroPITA.funcRun:: Selected Samples Rank")
|
|
958 logging.info(selectedSamples)
|
|
959
|
|
960 ###SUMMED AND NORMALIZED analysis block
|
|
961 #Diversity based metric will move reduce to terminal taxa as needed
|
|
962 if c_RUN_MAX_DIVERSITY_1 or c_RUN_REPRESENTIVE_DISSIMILARITY_2 or c_RUN_MAX_DISSIMILARITY_3:
|
|
963
|
|
964 self._funcRunNormalizeSensitiveMethods(abndData=stratAbundanceTable, iSampleSelectionCount=iCount,
|
|
965 dictSelectedSamples=selectedSamples, lsAlphaMetrics=diversityMetricsAlpha,
|
|
966 lsBetaMetrics=diversityMetricsBeta,
|
|
967 lsInverseBetaMetrics=diversityMetricsBeta,
|
|
968 fRunDiversity=c_RUN_MAX_DIVERSITY_1,fRunRepresentative=c_RUN_REPRESENTIVE_DISSIMILARITY_2,
|
|
969 fRunExtreme=c_RUN_MAX_DISSIMILARITY_3,
|
|
970 istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr, fInvertDiversity=fInvertDiversity)
|
|
971
|
|
972 #5::Select randomly
|
|
973 #Expects sampleNames = List of sample names [name, name, name...]
|
|
974 if(c_RUN_RANDOM_5):
|
|
975 #Select randomly from sample names
|
|
976 selectedSamples[ConstantsMicropita.c_strRandom] = self.funcGetRandomSamples(lsSamples=stratAbundanceTable.funcGetSampleNames(), iNumberOfSamplesToReturn=iCount)
|
|
977 logging.info("MicroPITA.funcRun:: Selected Samples Random")
|
|
978 logging.info(selectedSamples)
|
|
979
|
|
980 #Perform supervised selection
|
|
981 if c_RUN_DISTINCT or c_RUN_DISCRIMINANT:
|
|
982 if strLabel:
|
|
983 dictSelectionRet = self.funcRunSupervisedDistancesFromCentroids(abundanceTable=stratAbundanceTable,
|
|
984 fRunDistinct=c_RUN_DISTINCT, fRunDiscriminant=c_RUN_DISCRIMINANT,
|
|
985 xOutputSupFile=ostmInputPredictFile,xPredictSupFile=ostmPredictFile,
|
|
986 strSupervisedMetadata=strLabel, iSampleSupSelectionCount=iCount,
|
|
987 lsOriginalSampleNames = totalAbundanceTable.funcGetSampleNames(),
|
|
988 lsOriginalLabels = lsOriginalLabels,
|
|
989 fAppendFiles=fAppendSupFiles)
|
|
990
|
|
991 [selectedSamples.setdefault(sKey,[]).extend(lValue) for sKey,lValue in dictSelectionRet.items()]
|
|
992
|
|
993 if not fAppendSupFiles:
|
|
994 fAppendSupFiles = True
|
|
995 logging.info("MicroPITA.funcRun:: Selected Samples Unsupervised")
|
|
996 logging.info(selectedSamples)
|
|
997 return selectedSamples
|
|
998
|
|
999 #Testing: Happy path tested
|
|
1000 @staticmethod
|
|
1001 def funcWriteSelectionToFile(dictSelection,xOutputFilePath):
|
|
1002 """
|
|
1003 Writes the selection of samples by method to an output file.
|
|
1004
|
|
1005 :param dictSelection: The dictionary of selections by method to be written to a file.
|
|
1006 :type: Dictionary The dictionary of selections by method {"method":["sample selected","sample selected"...]}
|
|
1007 :param xOutputFilePath: FileStream or String path to file inwhich the dictionary is written.
|
|
1008 :type: String FileStream or String path to file
|
|
1009 """
|
|
1010
|
|
1011 if not dictSelection:
|
|
1012 return
|
|
1013
|
|
1014 #Open file
|
|
1015 f = csv.writer(open(xOutputFilePath,"w") if isinstance(xOutputFilePath, str) else xOutputFilePath, delimiter=ConstantsMicropita.c_outputFileDelim )
|
|
1016
|
|
1017 #Create output content from dictionary
|
|
1018 for sKey in dictSelection:
|
|
1019 f.writerow([sKey]+dictSelection[sKey])
|
|
1020 logging.debug("MicroPITA.funcRun:: Selected samples output to file:"+str(dictSelection[sKey]))
|
|
1021
|
|
1022 #Testing: Happy Path tested
|
|
1023 @staticmethod
|
|
1024 def funcReadSelectionFileToDictionary(xInputFile):
|
|
1025 """
|
|
1026 Reads in an output selection file from micropita and formats it into a dictionary.
|
|
1027
|
|
1028 :param xInputFile: String path to file or file stream to read and translate into a dictionary.
|
|
1029 {"method":["sample selected","sample selected"...]}
|
|
1030 :type: FileStream or String Path to file
|
|
1031 :return Dictionary: Samples selected by methods.
|
|
1032 Dictionary {"Selection Method":["SampleID","SampleID","SampleID",...]}
|
|
1033 """
|
|
1034
|
|
1035 #Open file
|
|
1036 istmReader = csv.reader(open(xInputFile,'r') if isinstance(xInputFile, str) else xInputFile, delimiter = ConstantsMicropita.c_outputFileDelim)
|
|
1037
|
|
1038 #Dictionary to hold selection data
|
|
1039 return dict([(lsLine[0], lsLine[1:]) for lsLine in istmReader])
|
|
1040
|
|
1041 #Set up arguments reader
|
|
1042 argp = argparse.ArgumentParser( prog = "MicroPITA.py",
|
|
1043 description = """Selects samples from abundance tables based on various selection schemes.""" )
|
|
1044
|
|
1045 args = argp.add_argument_group( "Common", "Commonly modified options" )
|
|
1046 args.add_argument(ConstantsMicropita.c_strCountArgument,"--num", dest="iCount", metavar = "samples", default = 10, type = int, help = ConstantsMicropita.c_strCountHelp)
|
|
1047 args.add_argument("-m","--method", dest = "lstrMethods", metavar = "method", default = [], help = ConstantsMicropita.c_strSelectionTechniquesHelp,
|
|
1048 choices = ConstantsMicropita.c_lsAllMethods, action = "append")
|
|
1049
|
|
1050 args = argp.add_argument_group( "Custom", "Selecting and inputing custom metrics" )
|
|
1051 args.add_argument("-a","--alpha", dest = "strAlphaDiversity", metavar = "AlphaDiversity", default = None, help = ConstantsMicropita.c_strCustomAlphaDiversityHelp, choices = Metric.setAlphaDiversities)
|
|
1052 args.add_argument("-b","--beta", dest = "strBetaDiversity", metavar = "BetaDiversity", default = None, help = ConstantsMicropita.c_strCustomBetaDiversityHelp, choices = list(Metric.setBetaDiversities)+[Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted])
|
|
1053 args.add_argument("-q","--alphameta", dest = "strAlphaMetadata", metavar = "AlphaDiversityMetadata", default = None, help = ConstantsMicropita.c_strCustomAlphaDiversityMetadataHelp)
|
|
1054 args.add_argument("-x","--betamatrix", dest = "istmBetaMatrix", metavar = "BetaDiversityMatrix", default = None, help = ConstantsMicropita.c_strCustomBetaDiversityMatrixHelp)
|
|
1055 args.add_argument("-o","--tree", dest = "istrmTree", metavar = "PhylogeneticTree", default = None, help = ConstantsMicropita.c_strCustomPhylogeneticTreeHelp)
|
|
1056 args.add_argument("-i","--envr", dest = "istrmEnvr", metavar = "EnvironmentFile", default = None, help = ConstantsMicropita.c_strCustomEnvironmentFileHelp)
|
|
1057 args.add_argument("-f","--invertDiversity", dest = "fInvertDiversity", action="store_true", default = False, help = ConstantsMicropita.c_strInvertDiversityHelp)
|
|
1058
|
|
1059 args = argp.add_argument_group( "Miscellaneous", "Row/column identifiers and feature targeting options" )
|
|
1060 args.add_argument("-d",ConstantsMicropita.c_strIDNameArgument, dest="strIDName", metavar="sample_id", help= ConstantsMicropita.c_strIDNameHelp)
|
|
1061 args.add_argument("-l",ConstantsMicropita.c_strLastMetadataNameArgument, dest="strLastMetadataName", metavar = "metadata_id", default = None,
|
|
1062 help= ConstantsMicropita.c_strLastMetadataNameHelp)
|
|
1063 args.add_argument("-r",ConstantsMicropita.c_strTargetedFeatureMethodArgument, dest="strFeatureSelection", metavar="targeting_method", default=ConstantsMicropita.lsTargetedFeatureMethodValues[0],
|
|
1064 choices=ConstantsMicropita.lsTargetedFeatureMethodValues, help= ConstantsMicropita.c_strTargetedFeatureMethodHelp)
|
|
1065 args.add_argument("-t",ConstantsMicropita.c_strTargetedSelectionFileArgument, dest="istmFeatures", metavar="feature_file", type=argparse.FileType("rU"), help=ConstantsMicropita.c_strTargetedSelectionFileHelp)
|
|
1066 args.add_argument("-w",ConstantsMicropita.c_strFeatureMetadataArgument, dest="strLastFeatureMetadata", metavar="Last_Feature_Metadata", default=None, help=ConstantsMicropita.c_strFeatureMetadataHelp)
|
|
1067
|
|
1068 args = argp.add_argument_group( "Data labeling", "Metadata IDs for strata and supervised label values" )
|
|
1069 args.add_argument("-e",ConstantsMicropita.c_strSupervisedLabelArgument, dest="strLabel", metavar= "supervised_id", help=ConstantsMicropita.c_strSupervisedLabelHelp)
|
|
1070 args.add_argument("-s",ConstantsMicropita.c_strUnsupervisedStratifyMetadataArgument, dest="strUnsupervisedStratify", metavar="stratify_id",
|
|
1071 help= ConstantsMicropita.c_strUnsupervisedStratifyMetadataHelp)
|
|
1072
|
|
1073 args = argp.add_argument_group( "File formatting", "Rarely modified file formatting options" )
|
|
1074 args.add_argument("-j",ConstantsMicropita.c_strFileDelimiterArgument, dest="cFileDelimiter", metavar="column_delimiter", default="\t", help=ConstantsMicropita.c_strFileDelimiterHelp)
|
|
1075 args.add_argument("-k",ConstantsMicropita.c_strFeatureNameDelimiterArgument, dest="cFeatureNameDelimiter", metavar="taxonomy_delimiter", default="|", help=ConstantsMicropita.c_strFeatureNameDelimiterHelp)
|
|
1076
|
|
1077 args = argp.add_argument_group( "Debugging", "Debugging options - modify at your own risk!" )
|
|
1078 args.add_argument("-v",ConstantsMicropita.c_strLoggingArgument, dest="strLogLevel", metavar = "log_level", default="WARNING",
|
|
1079 choices=ConstantsMicropita.c_lsLoggingChoices, help= ConstantsMicropita.c_strLoggingHelp)
|
|
1080 args.add_argument("-c",ConstantsMicropita.c_strCheckedAbundanceFileArgument, dest="ostmCheckedFile", metavar = "output_qc", type = argparse.FileType("w"), help = ConstantsMicropita.c_strCheckedAbundanceFileHelp)
|
|
1081 args.add_argument("-g",ConstantsMicropita.c_strLoggingFileArgument, dest="ostmLoggingFile", metavar = "output_log", type = argparse.FileType("w"), help = ConstantsMicropita.c_strLoggingFileHelp)
|
|
1082 args.add_argument("-u",ConstantsMicropita.c_strSupervisedInputFile, dest="ostmInputPredictFile", metavar = "output_scaled", type = argparse.FileType("w"), help = ConstantsMicropita.c_strSupervisedInputFileHelp)
|
|
1083 args.add_argument("-p",ConstantsMicropita.c_strSupervisedPredictedFile, dest="ostmPredictFile", metavar = "output_labels", type = argparse.FileType("w"), help = ConstantsMicropita.c_strSupervisedPredictedFileHelp)
|
|
1084
|
|
1085 argp.add_argument("istmInput", metavar = "input.pcl/biome", type = argparse.FileType("rU"), help = ConstantsMicropita.c_strAbundanceFileHelp,
|
|
1086 default = sys.stdin)
|
|
1087 argp.add_argument("ostmOutput", metavar = "output.txt", type = argparse.FileType("w"), help = ConstantsMicropita.c_strGenericOutputDataFileHelp,
|
|
1088 default = sys.stdout)
|
|
1089
|
|
1090 __doc__ = "::\n\n\t" + argp.format_help( ).replace( "\n", "\n\t" ) + __doc__
|
|
1091
|
|
1092 def _main( ):
|
|
1093 args = argp.parse_args( )
|
|
1094
|
|
1095 #Set up logger
|
|
1096 iLogLevel = getattr(logging, args.strLogLevel.upper(), None)
|
|
1097 logging.basicConfig(stream = args.ostmLoggingFile if args.ostmLoggingFile else sys.stderr, filemode = 'w', level=iLogLevel)
|
|
1098
|
|
1099 #Run micropita
|
|
1100 logging.info("MicroPITA:: Start microPITA")
|
|
1101 microPITA = MicroPITA()
|
|
1102
|
|
1103 #Argparse will append to the default but will not remove the default so I do this here
|
|
1104 if not len(args.lstrMethods):
|
|
1105 args.lstrMethods = [ConstantsMicropita.c_strRepresentative]
|
|
1106
|
|
1107 dictSelectedSamples = microPITA.funcRun(
|
|
1108 strIDName = args.strIDName,
|
|
1109 strLastMetadataName = args.strLastMetadataName,
|
|
1110 istmInput = args.istmInput,
|
|
1111 ostmInputPredictFile = args.ostmInputPredictFile,
|
|
1112 ostmPredictFile = args.ostmPredictFile,
|
|
1113 ostmCheckedFile = args.ostmCheckedFile,
|
|
1114 ostmOutput = args.ostmOutput,
|
|
1115 cDelimiter = args.cFileDelimiter,
|
|
1116 cFeatureNameDelimiter = args.cFeatureNameDelimiter,
|
|
1117 istmFeatures = args.istmFeatures,
|
|
1118 strFeatureSelection = args.strFeatureSelection,
|
|
1119 iCount = args.iCount,
|
|
1120 strLastRowMetadata = args.strLastFeatureMetadata,
|
|
1121 strLabel = args.strLabel,
|
|
1122 strStratify = args.strUnsupervisedStratify,
|
|
1123 strCustomAlpha = args.strAlphaDiversity,
|
|
1124 strCustomBeta = args.strBetaDiversity,
|
|
1125 strAlphaMetadata = args.strAlphaMetadata,
|
|
1126 istmBetaMatrix = args.istmBetaMatrix,
|
|
1127 istrmTree = args.istrmTree,
|
|
1128 istrmEnvr = args.istrmEnvr,
|
|
1129 lstrMethods = args.lstrMethods,
|
|
1130 fInvertDiversity = args.fInvertDiversity
|
|
1131 )
|
|
1132
|
|
1133 if not dictSelectedSamples:
|
|
1134 logging.error("MicroPITA:: Error, did not get a result from analysis.")
|
|
1135 return -1
|
|
1136 logging.info("End microPITA")
|
|
1137
|
|
1138 #Log output for debugging
|
|
1139 logging.debug("MicroPITA:: Returned the following samples:"+str(dictSelectedSamples))
|
|
1140
|
|
1141 #Write selection to file
|
|
1142 microPITA.funcWriteSelectionToFile(dictSelection=dictSelectedSamples, xOutputFilePath=args.ostmOutput)
|
|
1143
|
|
1144 if __name__ == "__main__":
|
|
1145 _main( )
|