annotate maaslin-4450aa4ecc84/src/merge_metadata.py @ 1:a87d5a5f2776

Uploaded the version running on the prod server
author george-weingart
date Sun, 08 Feb 2015 23:08:38 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
1 #!/usr/bin/env python
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
2 #####################################################################################
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
3 #Copyright (C) <2012>
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
4 #
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
5 #Permission is hereby granted, free of charge, to any person obtaining a copy of
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
6 #this software and associated documentation files (the "Software"), to deal in the
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
7 #Software without restriction, including without limitation the rights to use, copy,
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
8 #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
9 #and to permit persons to whom the Software is furnished to do so, subject to
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
10 #the following conditions:
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
11 #
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
12 #The above copyright notice and this permission notice shall be included in all copies
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
13 #or substantial portions of the Software.
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
14 #
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
15 #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
16 #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
17 #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
18 #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
19 #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
20 #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
21 #
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
22 # This file is a component of the MaAsLin (Multivariate Associations Using Linear Models),
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
23 # authored by the Huttenhower lab at the Harvard School of Public Health
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
24 # (contact Timothy Tickle, ttickle@hsph.harvard.edu).
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
25 #####################################################################################
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
26 """
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
27 Examples
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
28 ~~~~~~~~
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
29
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
30 ``metadata.txt``::
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
31
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
32 - Y Z
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
33 a 1 x
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
34 b 0 y
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
35 c z
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
36
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
37 ``data.pcl``::
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
38
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
39 - a b c
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
40 A|B 1 2 3
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
41 A|C 4 5 6
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
42 D|E 7 8 9
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
43
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
44 ``Examples``::
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
45
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
46 $ merge_metadata.py metadata.txt < data.pcl
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
47 sample a b c
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
48 Y 1 0
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
49 Z x y z
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
50 A 0.416667 0.466667 0.5
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
51 A|B 0.0833333 0.133333 0.166667
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
52 A|C 0.333333 0.333333 0.333333
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
53 D|E 0.583333 0.533333 0.5
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
54
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
55 $ merge_metadata.py metadata.txt -t 0 < data.pcl
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
56 sample a b c
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
57 Y 1 0
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
58 Z x y z
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
59 A|B 0.0833333 0.133333 0.166667
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
60 A|C 0.333333 0.333333 0.333333
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
61 D|E 0.583333 0.533333 0.5
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
62
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
63 $ merge_metadata.py metadata.txt -t 1 < data.pcl
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
64 sample a b c
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
65 Y 1 0
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
66 Z x y z
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
67 A 0.416667 0.466667 0.5
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
68 D 0.583333 0.533333 0.5
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
69
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
70 $ merge_metadata.py metadata.txt -t 0 -n < data.pcl
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
71 sample a b c
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
72 Y 1 0
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
73 Z x y z
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
74 A|B 1 2 3
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
75 A|C 4 5 6
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
76 D|E 7 8 9
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
77
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
78 $ merge_metadata.py metadata.txt -t 0 -m 0.8 -s "-" < data.pcl
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
79 sample b c
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
80 Y 0 -
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
81 Z y z
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
82 A|B 0.133333 0.166667
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
83 A|C 0.333333 0.333333
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
84 D|E 0.533333 0.5
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
85
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
86 $ merge_metadata.py -t 0 < data.pcl
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
87 sample a b c
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
88 A|B 1 2 3
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
89 A|C 4 5 6
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
90 D|E 7 8 9
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
91
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
92 .. testsetup::
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
93
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
94 from merge_metadata import *
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
95 """
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
96
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
97 import argparse
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
98 import blist
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
99 import csv
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
100 import re
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
101 import sys
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
102
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
103 c_dTarget = 1.0
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
104 c_fRound = False
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
105
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
class CClade( object ):
    """
    Node of a clade tree: named child nodes plus an optional vector of values.

    ``m_hashChildren`` maps a child clade name to its :class:`CClade` node.
    ``m_adValues`` holds this node's values, or None until they are assigned by
    :meth:`set` or summed up from the children by :meth:`impute`.
    """

    def __init__( self ):

        self.m_hashChildren = {}
        self.m_adValues = None

    def get( self, astrClade ):
        """
        Returns the node found by following a path of clade names from this node.

        :param astrClade: Clade names from this node downwards; empty or None means this node.
        :type astrClade: collection of strings
        :returns: The addressed node; nodes missing along the path are created.
        """

        pNode = self
        for strName in ( astrClade or ( ) ):
            pChild = pNode.m_hashChildren.get( strName )
            # Only allocate a node when the path element is really missing.
            if pChild is None:
                pChild = pNode.m_hashChildren[strName] = CClade( )
            pNode = pChild

        return pNode

    def set( self, adValues ):
        """
        Stores a copy of the given values on this node, with false values stored as 0.

        :param adValues: Values to store; must support ``len``.
        :type adValues: collection of numbers
        """

        # Start from one repeated zero and overwrite only the true entries
        # (presumably to keep zero-heavy vectors cheap in a blist - confirm).
        self.m_adValues = blist.blist( [0] ) * len( adValues )
        for i, d in enumerate( adValues ):
            if d:
                self.m_adValues[i] = d

    def impute( self ):
        """
        Fills in missing values on this node and its descendants as sums of child values.

        A node that already has a non-empty value vector is left untouched and its
        subtree is not visited. Child vectors of differing lengths are summed
        position by position, shorter ones counting as zero in the missing positions.

        :returns: This node's values, or None if neither it nor any descendant has any.
        """

        if not self.m_adValues:
            for pChild in self.m_hashChildren.values( ):
                adChild = pChild.impute( )
                if not adChild:
                    continue
                if not self.m_adValues:
                    # Copy, so that later sums never modify the child's own vector.
                    self.m_adValues = adChild[:]
                    continue
                # Pad the running sum when this child is longer than the ones seen so far.
                iShort = len( adChild ) - len( self.m_adValues )
                if iShort > 0:
                    self.m_adValues.extend( [0] * iShort )
                for i, d in enumerate( adChild ):
                    if d:
                        self.m_adValues[i] += d

        return self.m_adValues

    def _freeze( self, hashValues, iTarget, astrClade, iDepth, fLeaves ):
        """
        Recursive worker for :meth:`freeze`.

        :param hashValues: Output mapping of pipe-joined clade path to value vector.
        :type hashValues: dict
        :param iTarget: Requested level; 0 selects every node, positive values count
            from the root, negative values count up from the leaves (-1 = leaf).
        :type iTarget: int
        :param astrClade: Clade names leading from the root to this node.
        :type astrClade: list of strings
        :param iDepth: Depth of this node, the root being 0.
        :type iDepth: int
        :param fLeaves: If true select only the requested level, otherwise every
            level between the root and the requested one.
        :type fLeaves: bool
        :returns: Set of distances from this node down to each leaf beneath it
            (``set( [0] )`` for a leaf).
        """

        setiRet = set( )
        for strChild, pChild in self.m_hashChildren.items( ):
            setiRet |= pChild._freeze( hashValues, iTarget, astrClade + [strChild], iDepth + 1, fLeaves )
        if self.m_hashChildren:
            # One step further from every leaf than the children are.
            setiRet = set( ( i + 1 ) for i in setiRet )
        else:
            setiRet.add( 0 )

        if not iTarget:
            fHit = True
        elif iTarget < 0:
            # Negative targets are measured upwards from the leaves.
            iHeight = -( iTarget + 1 )
            if fLeaves:
                fHit = iHeight in setiRet
            else:
                fHit = iHeight <= max( setiRet )
        elif fLeaves:
            fHit = ( iDepth == iTarget )
        else:
            fHit = ( iDepth <= iTarget )

        # The root (empty path) and nodes without values are never output.
        if astrClade and self.m_adValues and fHit:
            hashValues["|".join( astrClade )] = self.m_adValues

        return setiRet

    def freeze( self, hashValues, iTarget, fLeaves ):
        """
        Writes the values of every selected node below this one into ``hashValues``.

        Keys are the clade names from this node downwards joined by ``|``; the stored
        vectors are the nodes' own objects, not copies.

        :param hashValues: Mapping that receives the selected clades.
        :type hashValues: dict
        :param iTarget: Requested level; 0 selects every node, positive values count
            from the root, negative values count up from the leaves (-1 = leaf).
        :type iTarget: int
        :param fLeaves: If true select only the requested level, otherwise every
            level between the root and the requested one.
        :type fLeaves: bool
        """

        self._freeze( hashValues, iTarget, [], 0, fLeaves )

    def _repr( self, strClade ):
        """
        Returns ``<name values child child ...>`` for this node; name and values are
        left out when ``strClade`` is empty.

        :param strClade: Name under which this node is stored in its parent.
        :type strClade: str
        """

        astrParts = []
        if strClade:
            astrParts.append( "%s %s" % ( strClade, self.m_adValues ) )
        astrParts.extend( pChild._repr( strChild )
            for strChild, pChild in self.m_hashChildren.items( ) )

        return ( "<" + " ".join( astrParts ) + ">" )

    def __repr__( self ):

        return self._repr( "" )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
178
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
179 """
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
180 pTree = CClade( )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
181 pTree.get( ("A", "B") ).set( [1, 2, 3] )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
182 pTree.get( ("A", "C") ).set( [4, 5, 6] )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
183 pTree.get( ("D", "E") ).set( [7, 8, 9] )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
184 iTaxa = 0
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
185 if iTaxa:
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
186 pTree.impute( )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
187 hashFeatures = {}
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
188 pTree.freeze( hashFeatures, iTaxa, False )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
189 print( pTree )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
190 print( hashFeatures )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
191 sys.exit( 0 )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
192 #"""
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
193
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
194 def merge_metadata( aastrMetadata, aastrData, ostm, fNormalize, strMissing, astrExclude, dMin, iTaxa, fLeaves ):
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
195 """
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
196 Joins and outputs a data matrix with a metadata matrix, optionally normalizing and filtering it.
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
197 A pipe-delimited taxonomy hierarchy can also be dynamically added or removed.
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
198
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
199 :param aastrMetadata: Split lines from which metadata are read.
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
200 :type aastrMetadata: collection of string collections
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
201 :param aastrData: Split lines from which data are read.
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
202 :type aastrData: collection of string collections
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
203 :param ostm: Output stream to which joined rows are written.
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
204 :type ostm: output stream
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
205 :param fNormalize: If true, divide data values by column sums.
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
206 :type fNormalize: bool
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
207 :param strMissing: Representation for missing metadata values.
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
208 :type strMissing: str
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
209 :param astrExclude: Lines from which excluded IDs are read.
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
210 :type astrExclude: collection of strings
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
211 :param dMin: Minimum fraction of maximum value for per-column quality control.
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
212 :type dMin: float
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
213 :param iTaxa: Depth of taxonomy to be computed, -1 = leaves only, 0 = no change
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
214 :type iTaxa: int
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
215 :param fLeaves: Output only leaves, not complete taxonomy; ignored if taxa = 0
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
216 :type fLeaves: bool
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
217
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
218 Metadata are optional; if not provided, data will be optionally normalized or its taxonomy
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
219 modified as requested. Metadata are provided one row per sample, data one column per
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
220 sample, both files tab-delimited text with one header row and one header column.
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
221
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
222 Metadata IDs that do not match data IDs are discarded, and data IDs without corresponding
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
223 metadata IDs are given missing values. Missing data values are always treated (and output)
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
224 as zero.
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
225
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
226 Per-column quality control is performed if the requested minimum fraction is greater than
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
227 zero. Specifically, for each column i, the row j containing the maximum value d is
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
228 identified. If d is less than the minimum fraction of row j's maximum value over all columns,
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
229 the entire column i is removed.
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
230
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
231 A taxonomy hierarchy will be calculated by default if row IDs are pipe-delimited, i.e. of
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
232 the form A|B|C. All parent clades are computed by default, e.g. A|B and A, save when
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
233 they would be identical to a more specific child clade. Negative values are counted from the
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
234 bottom (right) of the hierarchy rather than the top. The special value of 0 deactivates
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
235 hierarchy calculation.
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
236
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
237 >>> aastrMetadata = [[t.strip( ) for t in s] for s in ("-YZ", "a1x", "b0y", "c z")]
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
238 >>> aastrData = [s.split( ) for s in ( \
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
239 "- a b c", \
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
240 "A|B 1 2 3", \
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
241 "A|C 4 5 6", \
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
242 "D|E 7 8 9")]
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
243 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", [], 0.01, -1, False ) #doctest: +NORMALIZE_WHITESPACE
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
244 sample a b c
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
245 Y 1 0
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
246 Z x y z
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
247 A 0.416667 0.466667 0.5
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
248 A|B 0.0833333 0.133333 0.166667
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
249 A|C 0.333333 0.333333 0.333333
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
250 D|E 0.583333 0.533333 0.5
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
251
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
252 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", [], 0.01, -1, True ) #doctest: +NORMALIZE_WHITESPACE
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
253 sample a b c
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
254 Y 1 0
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
255 Z x y z
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
256 A|B 0.0833333 0.133333 0.166667
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
257 A|C 0.333333 0.333333 0.333333
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
258 D|E 0.583333 0.533333 0.5
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
259
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
260 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", [], 0, 0, True ) #doctest: +NORMALIZE_WHITESPACE
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
261 sample a b c
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
262 Y 1 0
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
263 Z x y z
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
264 A|B 0.0833333 0.133333 0.166667
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
265 A|C 0.333333 0.333333 0.333333
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
266 D|E 0.583333 0.533333 0.5
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
267
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
268 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", [], 0, 1, False ) #doctest: +NORMALIZE_WHITESPACE
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
269 sample a b c
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
270 Y 1 0
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
271 Z x y z
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
272 A 0.416667 0.466667 0.5
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
273 D 0.583333 0.533333 0.5
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
274
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
275 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", [], 0, -1, True ) #doctest: +NORMALIZE_WHITESPACE
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
276 sample a b c
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
277 Y 1 0
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
278 Z x y z
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
279 A|B 0.0833333 0.133333 0.166667
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
280 A|C 0.333333 0.333333 0.333333
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
281 D|E 0.583333 0.533333 0.5
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
282
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
283 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, False, "", [], 0, 0, True ) #doctest: +NORMALIZE_WHITESPACE
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
284 sample a b c
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
285 Y 1 0
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
286 Z x y z
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
287 A|B 1 2 3
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
288 A|C 4 5 6
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
289 D|E 7 8 9
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
290
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
291 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "-", [], 0.8, 0, True ) #doctest: +NORMALIZE_WHITESPACE
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
292 sample b c
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
293 Y 0 -
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
294 Z y z
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
295 A|B 0.133333 0.166667
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
296 A|C 0.333333 0.333333
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
297 D|E 0.533333 0.5
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
298
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
299 >>> merge_metadata( None, aastrData, sys.stdout, False, "", [], 0, 0, True ) #doctest: +NORMALIZE_WHITESPACE
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
300 sample a b c
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
301 A|B 1 2 3
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
302 A|C 4 5 6
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
303 D|E 7 8 9
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
304
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
305 >>> merge_metadata( aastrMetadata, aastrData, sys.stdout, True, "", ["b"], 0.01, -1, False ) #doctest: +NORMALIZE_WHITESPACE
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
306 sample a c
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
307 Y 1
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
308 Z x z
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
309 A 0.416667 0.5
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
310 A|B 0.0833333 0.166667
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
311 A|C 0.333333 0.333333
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
312 D|E 0.583333 0.5
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
313 """
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
314
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
315 #Put metadata in a dictionary
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
316 #{"First line element": ["line element 2", "line element 3", "line element 4"]}
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
317 #If there is no metadata then astrMetadata stays None and hashMetadata stays empty
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
318 astrMetadata = None
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
319 hashMetadata = {}
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
320 for astrLine in ( aastrMetadata or [] ):
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
321 if astrMetadata:
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
322 hashMetadata[astrLine[0]] = astrLine[1:]
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
323 else:
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
324 astrMetadata = astrLine[1:]
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
325
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
326 astrHeaders = adSeqs = iCol = None
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
327 pTree = CClade( )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
328 aastrRaw = []
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
329 for astrLine in aastrData:
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
330 if astrHeaders:
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
331 if ( astrLine[0] == "EWEIGHT" ) or ( astrLine[0] == "total" ) or \
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
332 ( len( astrLine ) < 2 ):
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
333 continue
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
334 try:
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
335 adCounts = [( float(strCur) if len( strCur.strip( ) ) else 0 ) for
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
336 strCur in astrLine[iCol:]]
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
337 except ValueError:
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
338 aastrRaw.append( astrLine )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
339 continue
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
340 for i in range( len( adCounts ) ):
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
341 adSeqs[i] += adCounts[i]
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
342 if ( iCol > 1 ) and ( astrLine[0] != astrLine[1] ):
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
343 if astrLine[1].find( astrLine[0] ) >= 0:
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
344 astrLine[0] = astrLine[1]
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
345 else:
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
346 astrLine[0] += " " + astrLine[1]
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
347 pTree.get( astrLine[0].split( "|" ) ).set( adCounts )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
348 else:
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
349 iCol = 2 if ( astrLine[1].upper( ) == "NAME" ) else 1
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
350 astrHeaders = [strCur.replace( " ", "_" ) for strCur in astrLine[iCol:]]
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
351 adSeqs = [0] * len( astrHeaders )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
352
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
353 if iTaxa:
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
354 pTree.impute( )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
355 hashFeatures = {}
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
356 pTree.freeze( hashFeatures, iTaxa, fLeaves )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
357 setstrFeatures = hashFeatures.keys( )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
358
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
359 afOmit = [False] * len( astrHeaders )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
360 if dMin > 0:
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
361 aadData = list(hashFeatures.values( ))
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
362 for i in range( len( astrHeaders ) ):
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
363 iMax = max( range( len( aadData ) ), key = lambda j: aadData[j][i] )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
364 dMaxUs = aadData[iMax][i]
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
365 dMaxThem = max( aadData[iMax][j] for j in ( range( i ) + range( i + 1, len( astrHeaders ) ) ) )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
366 if dMaxUs < ( dMin * dMaxThem ):
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
367 sys.stderr.write( "Omitting: %s\n" % astrHeaders[i] )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
368 afOmit[i] = True
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
369
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
370 if astrExclude:
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
371 setstrExclude = set(s.strip( ) for s in astrExclude)
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
372 for i in range( len( astrHeaders ) ):
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
373 if ( not afOmit[i] ) and ( astrHeaders[i] in setstrExclude ):
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
374 afOmit[i] = True
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
375
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
376 adMult = [( ( c_dTarget / d ) if ( fNormalize and ( d > 0 ) ) else 1 ) for d in adSeqs]
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
377 for strFeature, adCounts in hashFeatures.items( ):
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
378 for i in range( len( adCounts ) ):
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
379 if adCounts[i]:
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
380 adCounts[i] *= adMult[i]
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
381 if c_fRound:
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
382 adCounts[i] = round( adCounts[i] )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
383 for strFeature, adCounts in hashFeatures.items( ):
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
384 astrFeature = strFeature.strip( ).split( "|" )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
385 while len( astrFeature ) > 1:
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
386 astrFeature = astrFeature[:-1]
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
387 strParent = "|".join( astrFeature )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
388 adParent = hashFeatures.get( strParent )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
389 if adParent == adCounts:
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
390 del hashFeatures[strParent]
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
391 setstrFeatures.remove( strParent )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
392
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
393 if astrMetadata:
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
394 for i in range( len( astrMetadata ) ):
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
395 hashFeatures[astrMetadata[i]] = astrCur = []
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
396 for strSubject in astrHeaders:
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
397 astrSubject = hashMetadata.get( strSubject )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
398 if not astrSubject:
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
399 strSubject = re.sub( '_.*$', "", strSubject )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
400 astrSubject = hashMetadata.get( strSubject, [] )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
401 astrCur.append( astrSubject[i] if ( i < len( astrSubject ) ) else "" )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
402
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
403 astrFeatures = sorted( astrMetadata or [] ) + sorted( setstrFeatures )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
404 aiHeaders = filter( lambda i: not afOmit[i], range( len( astrHeaders ) ) )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
405 csvw = csv.writer( sys.stdout, csv.excel_tab )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
406 csvw.writerow( ["sample"] + [astrHeaders[i] for i in aiHeaders] )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
407 for iFeature in range( len( astrFeatures ) ):
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
408 strFeature = astrFeatures[iFeature]
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
409 adFeature = hashFeatures[strFeature]
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
410 astrValues = [adFeature[i] for i in aiHeaders]
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
411 for i in range( len( astrValues ) ):
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
412 strValue = astrValues[i]
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
413 if type( strValue ) in (int, float):
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
414 astrValues[i] = "%g" % astrValues[i]
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
415 elif ( not strValue ) or ( ( type( strValue ) == str ) and
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
416 ( len( strValue ) == 0 ) ):
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
417 astrValues[i] = strMissing
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
418 csvw.writerow( [strFeature] + astrValues )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
419
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
420 for astrRaw in aastrRaw:
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
421 csvw.writerow( [astrRaw[i] for i in aiHeaders] )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
422
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
# Command-line interface. The parser is built at import time so that its
# formatted help text can be prepended to the module docstring below.
argp = argparse.ArgumentParser( prog = "merge_metadata.py",
    description = "Join a data matrix with a metadata matrix, optionally normalizing and filtering it.\n\n" +
    "A pipe-delimited taxonomy hierarchy can also be dynamically added or removed." )
# -n is a negative switch: normalization is on unless the flag is given.
argp.add_argument( "-n", dest = "fNormalize", action = "store_false",
    help = "Don't normalize data values by column sums" )
argp.add_argument( "-s", dest = "strMissing", metavar = "missing",
    type = str, default = " ",
    help = "String representing missing metadata values" )
argp.add_argument( "-m", dest = "dMin", metavar = "min",
    type = float, default = 0.01,
    help = "Per-column quality control, minimum fraction of maximum value" )
argp.add_argument( "-t", dest = "iTaxa", metavar = "taxa",
    type = int, default = -1,
    help = "Depth of taxonomy to be computed, negative = from right, 0 = no change" )
argp.add_argument( "-l", dest = "fLeaves", action = "store_true",
    help = "Output only leaves, not complete taxonomy" )
# The file arguments are opened for reading with the builtin open() rather than
# file(): the two are equivalent on Python 2, but file() no longer exists on
# Python 3, where referencing it raises NameError as soon as the module loads.
argp.add_argument( "-x", dest = "istmExclude", metavar = "exclude.txt",
    type = open,
    help = "File from which sample IDs to exclude are read" )
argp.add_argument( "istmMetadata", metavar = "metadata.txt",
    type = open, nargs = "?",
    help = "File from which metadata is read" )
# Prepend the usage text, as a tab-indented literal block, to the module
# docstring. __doc__ is None when the module has no docstring or docstrings
# are stripped (python -OO), so fall back to an empty string instead of
# failing with a TypeError at import time.
__doc__ = "::\n\n\t" + argp.format_help( ).replace( "\n", "\n\t" ) + ( __doc__ or "" )
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
446
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
def _main( ):
    """
    Command-line entry point.

    Parses the command line with the module-level parser, then calls
    merge_metadata with the optional metadata file (read as tab-delimited
    rows), the data matrix read as tab-delimited rows from standard input,
    standard output as the output stream, and the parsed options. Any files
    opened by the argument parser are closed afterwards, even if the merge
    raises.
    """

    args = argp.parse_args( )
    try:
        # The metadata file is optional; pass None through when it was not given.
        pMetadata = None
        if args.istmMetadata is not None:
            pMetadata = csv.reader( args.istmMetadata, csv.excel_tab )
        merge_metadata( pMetadata, csv.reader( sys.stdin, csv.excel_tab ), sys.stdout,
            args.fNormalize, args.strMissing, args.istmExclude, args.dMin,
            args.iTaxa, args.fLeaves )
    finally:
        # Release the handles argparse opened for the metadata and exclude files.
        for istm in ( args.istmMetadata, args.istmExclude ):
            if istm is not None:
                istm.close( )

# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    _main( )