comparison src/merge_metadata.py @ 8:e9677425c6c3 default tip

Updated the structure of the libraries
author george.weingart@gmail.com
date Mon, 09 Feb 2015 12:17:40 -0500
parents e0b5980139d9
children
comparison
equal deleted inserted replaced
7:c72e14eabb08 8:e9677425c6c3
1 #!/usr/bin/env python
2 #####################################################################################
3 #Copyright (C) <2012>
4 #
5 #Permission is hereby granted, free of charge, to any person obtaining a copy of
6 #this software and associated documentation files (the "Software"), to deal in the
7 #Software without restriction, including without limitation the rights to use, copy,
8 #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
9 #and to permit persons to whom the Software is furnished to do so, subject to
10 #the following conditions:
11 #
12 #The above copyright notice and this permission notice shall be included in all copies
13 #or substantial portions of the Software.
14 #
15 #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
16 #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
17 #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
18 #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19 #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20 #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 #
22 # This file is a component of the MaAsLin (Multivariate Associations Using Linear Models),
23 # authored by the Huttenhower lab at the Harvard School of Public Health
24 # (contact Timothy Tickle, ttickle@hsph.harvard.edu).
25 #####################################################################################
26 """
27 Examples
28 ~~~~~~~~
29
30 ``metadata.txt``::
31
32 - Y Z
33 a 1 x
34 b 0 y
35 c z
36
37 ``data.pcl``::
38
39 - a b c
40 A|B 1 2 3
41 A|C 4 5 6
42 D|E 7 8 9
43
44 ``Examples``::
45
46 $ merge_metadata.py metadata.txt < data.pcl
47 sample a b c
48 Y 1 0
49 Z x y z
50 A 0.416667 0.466667 0.5
51 A|B 0.0833333 0.133333 0.166667
52 A|C 0.333333 0.333333 0.333333
53 D|E 0.583333 0.533333 0.5
54
55 $ merge_metadata.py metadata.txt -t 0 < data.pcl
56 sample a b c
57 Y 1 0
58 Z x y z
59 A|B 0.0833333 0.133333 0.166667
60 A|C 0.333333 0.333333 0.333333
61 D|E 0.583333 0.533333 0.5
62
63 $ merge_metadata.py metadata.txt -t 1 < data.pcl
64 sample a b c
65 Y 1 0
66 Z x y z
67 A 0.416667 0.466667 0.5
68 D 0.583333 0.533333 0.5
69
70 $ merge_metadata.py metadata.txt -t 0 -n < data.pcl
71 sample a b c
72 Y 1 0
73 Z x y z
74 A|B 1 2 3
75 A|C 4 5 6
76 D|E 7 8 9
77
78 $ merge_metadata.py metadata.txt -t 0 -m 0.8 -s "-" < data.pcl
79 sample b c
80 Y 0 -
81 Z y z
82 A|B 0.133333 0.166667
83 A|C 0.333333 0.333333
84 D|E 0.533333 0.5
85
86 $ merge_metadata.py -t 0 < data.pcl
87 sample a b c
88 A|B 1 2 3
89 A|C 4 5 6
90 D|E 7 8 9
91
92 .. testsetup::
93
94 from merge_metadata import *
95 """
96
97 import argparse
98 import blist
99 import csv
100 import re
101 import sys
102
103 c_dTarget = 1.0
104 c_fRound = False
105
106 class CClade:
107
108 def __init__( self ):
109
110 self.m_hashChildren = {}
111 self.m_adValues = None
112
113 def get( self, astrClade ):
114
115 return self.m_hashChildren.setdefault(
116 astrClade[0], CClade( ) ).get( astrClade[1:] ) if astrClade else self
117
118 def set( self, adValues ):
119
120 self.m_adValues = blist.blist( [0] ) * len( adValues )
121 for i, d in enumerate( adValues ):
122 if d:
123 self.m_adValues[i] = d
124
125 def impute( self ):
126
127 if not self.m_adValues:
128 for pChild in self.m_hashChildren.values( ):
129 adChild = pChild.impute( )
130 if self.m_adValues:
131 for i in range( len( adChild or [] ) ):
132 if adChild[i]:
133 self.m_adValues[i] += adChild[i]
134 elif adChild:
135 self.m_adValues = adChild[:]
136
137 return self.m_adValues
138
139 def _freeze( self, hashValues, iTarget, astrClade, iDepth, fLeaves ):
140
141 fHit = ( not iTarget ) or ( ( fLeaves and ( iDepth == iTarget ) ) or ( ( not fLeaves ) and ( iDepth <= iTarget ) ) )
142 iDepth += 1
143 setiRet = set()
144 if self.m_hashChildren:
145 for strChild, pChild in self.m_hashChildren.items( ):
146 setiRet |= pChild._freeze( hashValues, iTarget, astrClade + [strChild], iDepth, fLeaves )
147 setiRet = set( ( i + 1 ) for i in setiRet )
148 else:
149 setiRet.add( 0 )
150 if iTarget < 0:
151 if fLeaves:
152 fHit = -( iTarget + 1 ) in setiRet
153 else:
154 fHit = -( iTarget + 1 ) <= max( setiRet )
155 if astrClade and self.m_adValues and fHit:
156 hashValues["|".join( astrClade )] = self.m_adValues
157 return setiRet
158
159 def freeze( self, hashValues, iTarget, fLeaves ):
160
161 self._freeze( hashValues, iTarget, [], 0, fLeaves )
162
163 def _repr( self, strClade ):
164
165 strRet = "<"
166 if strClade:
167 strRet += "%s %s" % (strClade, self.m_adValues)
168 if self.m_hashChildren:
169 strRet += " "
170 if self.m_hashChildren:
171 strRet += " ".join( p._repr( s ) for (s, p) in self.m_hashChildren.items( ) )
172
173 return ( strRet + ">" )
174
175 def __repr__( self ):
176
177 return self._repr( "" )
178
179 """
180 pTree = CClade( )
181 pTree.get( ("A", "B") ).set( [1, 2, 3] )
182 pTree.get( ("A", "C") ).set( [4, 5, 6] )
183 pTree.get( ("D", "E") ).set( [7, 8, 9] )
184 iTaxa = 0
185 if iTaxa:
186 pTree.impute( )
187 hashFeatures = {}
188 pTree.freeze( hashFeatures, iTaxa )
189 print( pTree )
190 print( hashFeatures )
191 sys.exit( 0 )
192 #"""
193
194 def merge_metadata( aastrMetadata, aastrData, ostm, fNormalize, strMissing, astrExclude, dMin, iTaxa, fLeaves ):
195 """
196 Joins and outputs a data matrix with a metadata matrix, optionally normalizing and filtering it.
197 A pipe-delimited taxonomy hierarchy can also be dynamically added or removed.
198
199 :param aastrMetadata: Split lines from which metadata are read.
200 :type aastrMetadata: collection of string collections
201 :param aastrData: Split lines from which data are read.
202 :type aastrData: collection of string collections
203 :param ostm: Output stream to which joined rows are written.
204 :type ostm: output stream
205 :param fNormalize: If true, divide data values by column sums.
206 :type fNormalize: bool
207 :param strMissing: Representation for missing metadata values.
208 :type strMissing: str
209 :param astrExclude: Lines from which excluded IDs are read.
210 :type astrExclude: collection of strings
211 :param dMin: Minimum fraction of maximum value for per-column quality control.
212 :type dMin: bool
213 :param iTaxa: Depth of taxonomy to be computed, -1 = leaves only, 0 = no change
214 :type iTaxa: int
215 :param fLeaves: Output only leaves, not complete taxonomy; ignored if taxa = 0
216 :type fLeaves: bool
217
218 Metadata are optional; if not provided, data will be optionally normalized or its taxonomy
219 modified as requested. Metadata are provided one row per sample, data one column per
220 sample, both files tab-delimited text with one header row and one header column.
221
222 Metadata IDs that do not match data IDs are discarded, and data IDs without corresponding
223 metadata IDs are given missing values. Missing data values are always treated (and output)
224 as zero.
225
226 Per-column quality control is performed if the requested minimum fraction is greater than
227 zero. Specifically, for each column i, the row j containing the maximum value d is
228 identified. If d is less than the minimum fraction of row j's maximum value over all columns,
229 the entire column i is removed.
230
231 A taxonomy hierarchy will be calculated by default if row IDs are pipe-delimited, i.e. of
232 the form A|B|C. All parent clades are computed by default, e.g. A|B and A, save when
233 they would be identical to a more specific child clade. Negative values are counted from the
234 bottom (right) of the hierarchy rather than the top. The special value of 0 deactivates
235 hierarchy calculation.
236
237 >>> aastrMetadata = [[t.strip( ) for t in s] for s in ("-YZ", "a1x", "b0y", "c z")]
238 >>> aastrData = [s.split( ) for s in ( \
239 "- a b c", \
240 "A|B 1 2 3", \
241 "A|C 4 5 6", \
242 "D|E 7 8 9")]
243 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", [], 0.01, -1, False ) #doctest: +NORMALIZE_WHITESPACE
244 sample a b c
245 Y 1 0
246 Z x y z
247 A 0.416667 0.466667 0.5
248 A|B 0.0833333 0.133333 0.166667
249 A|C 0.333333 0.333333 0.333333
250 D|E 0.583333 0.533333 0.5
251
252 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", [], 0.01, -1, True ) #doctest: +NORMALIZE_WHITESPACE
253 sample a b c
254 Y 1 0
255 Z x y z
256 A|B 0.0833333 0.133333 0.166667
257 A|C 0.333333 0.333333 0.333333
258 D|E 0.583333 0.533333 0.5
259
260 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", [], 0, 0, True ) #doctest: +NORMALIZE_WHITESPACE
261 sample a b c
262 Y 1 0
263 Z x y z
264 A|B 0.0833333 0.133333 0.166667
265 A|C 0.333333 0.333333 0.333333
266 D|E 0.583333 0.533333 0.5
267
268 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", [], 0, 1, False ) #doctest: +NORMALIZE_WHITESPACE
269 sample a b c
270 Y 1 0
271 Z x y z
272 A 0.416667 0.466667 0.5
273 D 0.583333 0.533333 0.5
274
275 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", [], 0, -1, True ) #doctest: +NORMALIZE_WHITESPACE
276 sample a b c
277 Y 1 0
278 Z x y z
279 A|B 0.0833333 0.133333 0.166667
280 A|C 0.333333 0.333333 0.333333
281 D|E 0.583333 0.533333 0.5
282
283 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, False, "", [], 0, 0, True ) #doctest: +NORMALIZE_WHITESPACE
284 sample a b c
285 Y 1 0
286 Z x y z
287 A|B 1 2 3
288 A|C 4 5 6
289 D|E 7 8 9
290
291 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "-", [], 0.8, 0, True ) #doctest: +NORMALIZE_WHITESPACE
292 sample b c
293 Y 0 -
294 Z y z
295 A|B 0.133333 0.166667
296 A|C 0.333333 0.333333
297 D|E 0.533333 0.5
298
299 >>> merge_metadata( None, aastrData, sys.stdout, False, "", [], 0, 0, True ) #doctest: +NORMALIZE_WHITESPACE
300 sample a b c
301 A|B 1 2 3
302 A|C 4 5 6
303 D|E 7 8 9
304
305 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", ["b"], 0.01, -1, False ) #doctest: +NORMALIZE_WHITESPACE
306 sample a c
307 Y 1
308 Z x z
309 A 0.416667 0.5
310 A|B 0.0833333 0.166667
311 A|C 0.333333 0.333333
312 D|E 0.583333 0.5
313 """
314
315 #Put metadata in a dictionary
316 #{"First line element",["line element 2","line element 3","line element 4"]}
317 #If there is no metadata then
318 astrMetadata = None
319 hashMetadata = {}
320 for astrLine in ( aastrMetadata or [] ):
321 if astrMetadata:
322 hashMetadata[astrLine[0]] = astrLine[1:]
323 else:
324 astrMetadata = astrLine[1:]
325
326 astrHeaders = adSeqs = iCol = None
327 pTree = CClade( )
328 aastrRaw = []
329 for astrLine in aastrData:
330 if astrHeaders:
331 if ( astrLine[0] == "EWEIGHT" ) or ( astrLine[0] == "total" ) or \
332 ( len( astrLine ) < 2 ):
333 continue
334 try:
335 adCounts = [( float(strCur) if len( strCur.strip( ) ) else 0 ) for
336 strCur in astrLine[iCol:]]
337 except ValueError:
338 aastrRaw.append( astrLine )
339 continue
340 for i in range( len( adCounts ) ):
341 adSeqs[i] += adCounts[i]
342 if ( iCol > 1 ) and ( astrLine[0] != astrLine[1] ):
343 if astrLine[1].find( astrLine[0] ) >= 0:
344 astrLine[0] = astrLine[1]
345 else:
346 astrLine[0] += " " + astrLine[1]
347 pTree.get( astrLine[0].split( "|" ) ).set( adCounts )
348 else:
349 iCol = 2 if ( astrLine[1].upper( ) == "NAME" ) else 1
350 astrHeaders = [strCur.replace( " ", "_" ) for strCur in astrLine[iCol:]]
351 adSeqs = [0] * len( astrHeaders )
352
353 if iTaxa:
354 pTree.impute( )
355 hashFeatures = {}
356 pTree.freeze( hashFeatures, iTaxa, fLeaves )
357 setstrFeatures = hashFeatures.keys( )
358
359 afOmit = [False] * len( astrHeaders )
360 if dMin > 0:
361 aadData = list(hashFeatures.values( ))
362 for i in range( len( astrHeaders ) ):
363 iMax = max( range( len( aadData ) ), key = lambda j: aadData[j][i] )
364 dMaxUs = aadData[iMax][i]
365 dMaxThem = max( aadData[iMax][j] for j in ( range( i ) + range( i + 1, len( astrHeaders ) ) ) )
366 if dMaxUs < ( dMin * dMaxThem ):
367 sys.stderr.write( "Omitting: %s\n" % astrHeaders[i] )
368 afOmit[i] = True
369
370 if astrExclude:
371 setstrExclude = set(s.strip( ) for s in astrExclude)
372 for i in range( len( astrHeaders ) ):
373 if ( not afOmit[i] ) and ( astrHeaders[i] in setstrExclude ):
374 afOmit[i] = True
375
376 adMult = [( ( c_dTarget / d ) if ( fNormalize and ( d > 0 ) ) else 1 ) for d in adSeqs]
377 for strFeature, adCounts in hashFeatures.items( ):
378 for i in range( len( adCounts ) ):
379 if adCounts[i]:
380 adCounts[i] *= adMult[i]
381 if c_fRound:
382 adCounts[i] = round( adCounts[i] )
383 for strFeature, adCounts in hashFeatures.items( ):
384 astrFeature = strFeature.strip( ).split( "|" )
385 while len( astrFeature ) > 1:
386 astrFeature = astrFeature[:-1]
387 strParent = "|".join( astrFeature )
388 adParent = hashFeatures.get( strParent )
389 if adParent == adCounts:
390 del hashFeatures[strParent]
391 setstrFeatures.remove( strParent )
392
393 if astrMetadata:
394 for i in range( len( astrMetadata ) ):
395 hashFeatures[astrMetadata[i]] = astrCur = []
396 for strSubject in astrHeaders:
397 astrSubject = hashMetadata.get( strSubject )
398 if not astrSubject:
399 strSubject = re.sub( '_.*$', "", strSubject )
400 astrSubject = hashMetadata.get( strSubject, [] )
401 astrCur.append( astrSubject[i] if ( i < len( astrSubject ) ) else "" )
402
403 astrFeatures = sorted( astrMetadata or [] ) + sorted( setstrFeatures )
404 aiHeaders = filter( lambda i: not afOmit[i], range( len( astrHeaders ) ) )
405 csvw = csv.writer( sys.stdout, csv.excel_tab )
406 csvw.writerow( ["sample"] + [astrHeaders[i] for i in aiHeaders] )
407 for iFeature in range( len( astrFeatures ) ):
408 strFeature = astrFeatures[iFeature]
409 adFeature = hashFeatures[strFeature]
410 astrValues = [adFeature[i] for i in aiHeaders]
411 for i in range( len( astrValues ) ):
412 strValue = astrValues[i]
413 if type( strValue ) in (int, float):
414 astrValues[i] = "%g" % astrValues[i]
415 elif ( not strValue ) or ( ( type( strValue ) == str ) and
416 ( len( strValue ) == 0 ) ):
417 astrValues[i] = strMissing
418 csvw.writerow( [strFeature] + astrValues )
419
420 for astrRaw in aastrRaw:
421 csvw.writerow( [astrRaw[i] for i in aiHeaders] )
422
423 argp = argparse.ArgumentParser( prog = "merge_metadata.py",
424 description = "Join a data matrix with a metadata matrix, optionally normalizing and filtering it.\n\n" +
425 "A pipe-delimited taxonomy hierarchy can also be dynamically added or removed." )
426 argp.add_argument( "-n", dest = "fNormalize", action = "store_false",
427 help = "Don't normalize data values by column sums" )
428 argp.add_argument( "-s", dest = "strMissing", metavar = "missing",
429 type = str, default = " ",
430 help = "String representing missing metadata values" )
431 argp.add_argument( "-m", dest = "dMin", metavar = "min",
432 type = float, default = 0.01,
433 help = "Per-column quality control, minimum fraction of maximum value" )
434 argp.add_argument( "-t", dest = "iTaxa", metavar = "taxa",
435 type = int, default = -1,
436 help = "Depth of taxonomy to be computed, negative = from right, 0 = no change" )
437 argp.add_argument( "-l", dest = "fLeaves", action = "store_true",
438 help = "Output only leaves, not complete taxonomy" )
439 argp.add_argument( "-x", dest = "istmExclude", metavar = "exclude.txt",
440 type = file,
441 help = "File from which sample IDs to exclude are read" )
442 argp.add_argument( "istmMetadata", metavar = "metadata.txt",
443 type = file, nargs = "?",
444 help = "File from which metadata is read" )
445 __doc__ = "::\n\n\t" + argp.format_help( ).replace( "\n", "\n\t" ) + __doc__
446
447 def _main( ):
448 args = argp.parse_args( )
449 merge_metadata( args.istmMetadata and csv.reader( args.istmMetadata, csv.excel_tab ),
450 csv.reader( sys.stdin, csv.excel_tab ), sys.stdout, args.fNormalize, args.strMissing,
451 args.istmExclude, args.dMin, args.iTaxa, args.fLeaves )
452
453 if __name__ == "__main__":
454 _main( )