annotate format_input.py @ 0:e7cd19afda2e draft

Lefse
author george-weingart
date Tue, 13 May 2014 21:57:00 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
george-weingart
parents:
diff changeset
1 #!/usr/bin/env python
george-weingart
parents:
diff changeset
2
george-weingart
parents:
diff changeset
3 import sys,os,argparse,pickle,re
george-weingart
parents:
diff changeset
4
george-weingart
parents:
diff changeset
5 def read_input_file(inp_file):
george-weingart
parents:
diff changeset
6 with open(inp_file) as inp:
george-weingart
parents:
diff changeset
7 return [[v.strip() for v in line.strip().split("\t")] for line in inp.readlines()]
george-weingart
parents:
diff changeset
8
george-weingart
parents:
diff changeset
9 def transpose(data):
george-weingart
parents:
diff changeset
10 return zip(*data)
george-weingart
parents:
diff changeset
11
george-weingart
parents:
diff changeset
12 def read_params(args):
george-weingart
parents:
diff changeset
13 parser = argparse.ArgumentParser(description='LEfSe formatting modules')
george-weingart
parents:
diff changeset
14 parser.add_argument('input_file', metavar='INPUT_FILE', type=str, help="the input file, feature hierarchical level can be specified with | or . and those symbols must not be present for other reasons in the input file.")
george-weingart
parents:
diff changeset
15 parser.add_argument('output_file', metavar='OUTPUT_FILE', type=str,
george-weingart
parents:
diff changeset
16 help="the output file containing the data for LEfSe")
george-weingart
parents:
diff changeset
17 parser.add_argument('--output_table', type=str, required=False, default="",
george-weingart
parents:
diff changeset
18 help="the formatted table in txt format")
george-weingart
parents:
diff changeset
19 parser.add_argument('-f',dest="feats_dir", choices=["c","r"], type=str, default="r",
george-weingart
parents:
diff changeset
20 help="set whether the features are on rows (default) or on columns")
george-weingart
parents:
diff changeset
21 parser.add_argument('-c',dest="class", metavar="[1..n_feats]", type=int, default=1,
george-weingart
parents:
diff changeset
22 help="set which feature use as class (default 1)")
george-weingart
parents:
diff changeset
23 parser.add_argument('-s',dest="subclass", metavar="[1..n_feats]", type=int, default=None,
george-weingart
parents:
diff changeset
24 help="set which feature use as subclass (default -1 meaning no subclass)")
george-weingart
parents:
diff changeset
25 parser.add_argument('-o',dest="norm_v", metavar="float", type=float, default=-1.0,
george-weingart
parents:
diff changeset
26 help="set the normalization value (default -1.0 meaning no normalization)")
george-weingart
parents:
diff changeset
27 parser.add_argument('-u',dest="subject", metavar="[1..n_feats]", type=int, default=None,
george-weingart
parents:
diff changeset
28 help="set which feature use as subject (default -1 meaning no subject)")
george-weingart
parents:
diff changeset
29 parser.add_argument('-m',dest="missing_p", choices=["f","s"], type=str, default="d",
george-weingart
parents:
diff changeset
30 help="set the policy to adopt with missin values: f removes the features with missing values, s removes samples with missing values (default f)")
george-weingart
parents:
diff changeset
31 parser.add_argument('-n',dest="subcl_min_card", metavar="int", type=int, default=10,
george-weingart
parents:
diff changeset
32 help="set the minimum cardinality of each subclass (subclasses with low cardinalities will be grouped together, if the cardinality is still low, no pairwise comparison will be performed with them)")
george-weingart
parents:
diff changeset
33
george-weingart
parents:
diff changeset
34 args = parser.parse_args()
george-weingart
parents:
diff changeset
35
george-weingart
parents:
diff changeset
36 return vars(args)
george-weingart
parents:
diff changeset
37
george-weingart
parents:
diff changeset
38 def remove_missing(data,roc):
george-weingart
parents:
diff changeset
39 if roc == "c": data = transpose(data)
george-weingart
parents:
diff changeset
40 max_len = max([len(r) for r in data])
george-weingart
parents:
diff changeset
41 to_rem = []
george-weingart
parents:
diff changeset
42 for i,r in enumerate(data):
george-weingart
parents:
diff changeset
43 if len([v for v in r if not( v == "" or v.isspace())]) < max_len: to_rem.append(i)
george-weingart
parents:
diff changeset
44 if len(to_rem):
george-weingart
parents:
diff changeset
45 for i in to_rem.reverse():
george-weingart
parents:
diff changeset
46 data.pop(i)
george-weingart
parents:
diff changeset
47 if roc == "c": return transpose(data)
george-weingart
parents:
diff changeset
48 return data
george-weingart
parents:
diff changeset
49
george-weingart
parents:
diff changeset
50
george-weingart
parents:
diff changeset
51 def sort_by_cl(data,n,c,s,u):
george-weingart
parents:
diff changeset
52 def sort_lines1(a,b):
george-weingart
parents:
diff changeset
53 return int(a[c] > b[c])*2-1
george-weingart
parents:
diff changeset
54 def sort_lines2u(a,b):
george-weingart
parents:
diff changeset
55 if a[c] != b[c]: return int(a[c] > b[c])*2-1
george-weingart
parents:
diff changeset
56 return int(a[u] > b[u])*2-1
george-weingart
parents:
diff changeset
57 def sort_lines2s(a,b):
george-weingart
parents:
diff changeset
58 if a[c] != b[c]: return int(a[c] > b[c])*2-1
george-weingart
parents:
diff changeset
59 return int(a[s] > b[s])*2-1
george-weingart
parents:
diff changeset
60 def sort_lines3(a,b):
george-weingart
parents:
diff changeset
61 if a[c] != b[c]: return int(a[c] > b[c])*2-1
george-weingart
parents:
diff changeset
62 if a[s] != b[s]: return int(a[s] > b[s])*2-1
george-weingart
parents:
diff changeset
63 return int(a[u] > b[u])*2-1
george-weingart
parents:
diff changeset
64 if n == 3: data.sort(sort_lines3)
george-weingart
parents:
diff changeset
65 if n == 2:
george-weingart
parents:
diff changeset
66 if s is None:
george-weingart
parents:
diff changeset
67 data.sort(sort_lines2u)
george-weingart
parents:
diff changeset
68 else:
george-weingart
parents:
diff changeset
69 data.sort(sort_lines2s)
george-weingart
parents:
diff changeset
70 if n == 1: data.sort(sort_lines1)
george-weingart
parents:
diff changeset
71 return data
george-weingart
parents:
diff changeset
72
george-weingart
parents:
diff changeset
73 def group_small_subclasses(cls,min_subcl):
george-weingart
parents:
diff changeset
74 last = ""
george-weingart
parents:
diff changeset
75 n = 0
george-weingart
parents:
diff changeset
76 repl = []
george-weingart
parents:
diff changeset
77 dd = [list(cls['class']),list(cls['subclass'])]
george-weingart
parents:
diff changeset
78 for d in dd:
george-weingart
parents:
diff changeset
79 if d[1] != last:
george-weingart
parents:
diff changeset
80 if n < min_subcl and last != "":
george-weingart
parents:
diff changeset
81 repl.append(d[1])
george-weingart
parents:
diff changeset
82 last = d[1]
george-weingart
parents:
diff changeset
83 n = 1
george-weingart
parents:
diff changeset
84 for i,d in enumerate(dd):
george-weingart
parents:
diff changeset
85 if d[1] in repl: dd[i][1] = "other"
george-weingart
parents:
diff changeset
86 dd[i][1] = str(dd[i][0])+"_"+str(dd[i][1])
george-weingart
parents:
diff changeset
87 cls['class'] = dd[0]
george-weingart
parents:
diff changeset
88 cls['subclass'] = dd[1]
george-weingart
parents:
diff changeset
89 return cls
george-weingart
parents:
diff changeset
90
george-weingart
parents:
diff changeset
91 def get_class_slices(data):
george-weingart
parents:
diff changeset
92 previous_class = data[0][0]
george-weingart
parents:
diff changeset
93 previous_subclass = data[0][1]
george-weingart
parents:
diff changeset
94 subclass_slices = []
george-weingart
parents:
diff changeset
95 class_slices = []
george-weingart
parents:
diff changeset
96 last_cl = 0
george-weingart
parents:
diff changeset
97 last_subcl = 0
george-weingart
parents:
diff changeset
98 class_hierarchy = []
george-weingart
parents:
diff changeset
99 subcls = []
george-weingart
parents:
diff changeset
100 for i,d in enumerate(data):
george-weingart
parents:
diff changeset
101 if d[1] != previous_subclass:
george-weingart
parents:
diff changeset
102 subclass_slices.append((previous_subclass,(last_subcl,i)))
george-weingart
parents:
diff changeset
103 last_subcl = i
george-weingart
parents:
diff changeset
104 subcls.append(previous_subclass)
george-weingart
parents:
diff changeset
105 if d[0] != previous_class:
george-weingart
parents:
diff changeset
106 class_slices.append((previous_class,(last_cl,i)))
george-weingart
parents:
diff changeset
107 class_hierarchy.append((previous_class,subcls))
george-weingart
parents:
diff changeset
108 subcls = []
george-weingart
parents:
diff changeset
109 last_cl = i
george-weingart
parents:
diff changeset
110 previous_subclass = d[1]
george-weingart
parents:
diff changeset
111 previous_class = d[0]
george-weingart
parents:
diff changeset
112 subclass_slices.append((previous_subclass,(last_subcl,i+1)))
george-weingart
parents:
diff changeset
113 subcls.append(previous_subclass)
george-weingart
parents:
diff changeset
114 class_slices.append((previous_class,(last_cl,i+1)))
george-weingart
parents:
diff changeset
115 class_hierarchy.append((previous_class,subcls))
george-weingart
parents:
diff changeset
116 return dict(class_slices), dict(subclass_slices), dict(class_hierarchy)
george-weingart
parents:
diff changeset
117
george-weingart
parents:
diff changeset
118 def numerical_values(feats,norm):
george-weingart
parents:
diff changeset
119 mm = []
george-weingart
parents:
diff changeset
120 for k,v in feats.items():
george-weingart
parents:
diff changeset
121 feats[k] = [float(val) for val in v]
george-weingart
parents:
diff changeset
122 if norm < 0.0: return feats
george-weingart
parents:
diff changeset
123 tr = zip(*(feats.values()))
george-weingart
parents:
diff changeset
124 mul = []
george-weingart
parents:
diff changeset
125 fk = feats.keys()
george-weingart
parents:
diff changeset
126 hie = True if sum([k.count(".") for k in fk]) > len(fk) else False
george-weingart
parents:
diff changeset
127 for i in range(len(feats.values()[0])):
george-weingart
parents:
diff changeset
128 if hie: mul.append(sum([t for j,t in enumerate(tr[i]) if fk[j].count(".") < 1 ]))
george-weingart
parents:
diff changeset
129 else: mul.append(sum(tr[i]))
george-weingart
parents:
diff changeset
130 if hie and sum(mul) == 0:
george-weingart
parents:
diff changeset
131 mul = []
george-weingart
parents:
diff changeset
132 for i in range(len(feats.values()[0])):
george-weingart
parents:
diff changeset
133 mul.append(sum(tr[i]))
george-weingart
parents:
diff changeset
134 for i,m in enumerate(mul):
george-weingart
parents:
diff changeset
135 if m == 0: mul[i] = 0.0
george-weingart
parents:
diff changeset
136 else: mul[i] = float(norm) / m
george-weingart
parents:
diff changeset
137 for k,v in feats.items():
george-weingart
parents:
diff changeset
138 feats[k] = [val*mul[i] for i,val in enumerate(v)]
george-weingart
parents:
diff changeset
139 return feats
george-weingart
parents:
diff changeset
140
george-weingart
parents:
diff changeset
141 def add_missing_levels2(ff):
george-weingart
parents:
diff changeset
142
george-weingart
parents:
diff changeset
143 if sum( [f.count(".") for f in ff] ) < 1: return ff
george-weingart
parents:
diff changeset
144
george-weingart
parents:
diff changeset
145 dn = {}
george-weingart
parents:
diff changeset
146
george-weingart
parents:
diff changeset
147 added = True
george-weingart
parents:
diff changeset
148 while added:
george-weingart
parents:
diff changeset
149 added = False
george-weingart
parents:
diff changeset
150 for f in ff:
george-weingart
parents:
diff changeset
151 lev = f.count(".")
george-weingart
parents:
diff changeset
152 if lev == 0: continue
george-weingart
parents:
diff changeset
153 if lev not in dn: dn[lev] = [f]
george-weingart
parents:
diff changeset
154 else: dn[lev].append(f)
george-weingart
parents:
diff changeset
155 for fn in sorted(dn,reverse=True):
george-weingart
parents:
diff changeset
156 for f in dn[fn]:
george-weingart
parents:
diff changeset
157 fc = ".".join(f.split('.')[:-1])
george-weingart
parents:
diff changeset
158 if fc not in ff:
george-weingart
parents:
diff changeset
159 ab_all = [ff[fg] for fg in ff if (fg.count(".") == 0 and fg == fc) or (fg.count(".") > 0 and fc == ".".join(fg.split('.')[:-1]))]
george-weingart
parents:
diff changeset
160 ab =[]
george-weingart
parents:
diff changeset
161 for l in [f for f in zip(*ab_all)]:
george-weingart
parents:
diff changeset
162 ab.append(sum([float(ll) for ll in l]))
george-weingart
parents:
diff changeset
163 ff[fc] = ab
george-weingart
parents:
diff changeset
164 added = True
george-weingart
parents:
diff changeset
165 if added:
george-weingart
parents:
diff changeset
166 break
george-weingart
parents:
diff changeset
167
george-weingart
parents:
diff changeset
168 return ff
george-weingart
parents:
diff changeset
169
george-weingart
parents:
diff changeset
170
george-weingart
parents:
diff changeset
171 def add_missing_levels(ff):
george-weingart
parents:
diff changeset
172 if sum( [f.count(".") for f in ff] ) < 1: return ff
george-weingart
parents:
diff changeset
173
george-weingart
parents:
diff changeset
174 clades2leaves = {}
george-weingart
parents:
diff changeset
175 for f in ff:
george-weingart
parents:
diff changeset
176 fs = f.split(".")
george-weingart
parents:
diff changeset
177 if len(fs) < 2:
george-weingart
parents:
diff changeset
178 continue
george-weingart
parents:
diff changeset
179 for l in range(len(fs)):
george-weingart
parents:
diff changeset
180 n = ".".join( fs[:l] )
george-weingart
parents:
diff changeset
181 if n in clades2leaves:
george-weingart
parents:
diff changeset
182 clades2leaves[n].append( f )
george-weingart
parents:
diff changeset
183 else:
george-weingart
parents:
diff changeset
184 clades2leaves[n] = [f]
george-weingart
parents:
diff changeset
185 for k,v in clades2leaves.items():
george-weingart
parents:
diff changeset
186 if k and k not in ff:
george-weingart
parents:
diff changeset
187 ff[k] = [sum(a) for a in zip(*[[float(fn) for fn in ff[vv]] for vv in v])]
george-weingart
parents:
diff changeset
188 return ff
george-weingart
parents:
diff changeset
189
george-weingart
parents:
diff changeset
190
george-weingart
parents:
diff changeset
191
george-weingart
parents:
diff changeset
192 def modify_feature_names(fn):
george-weingart
parents:
diff changeset
193 ret = fn
george-weingart
parents:
diff changeset
194
george-weingart
parents:
diff changeset
195 for v in [' ',r'\$',r'\@',r'#',r'%',r'\^',r'\&',r'\*',r'\"',r'\'']:
george-weingart
parents:
diff changeset
196 ret = [re.sub(v,"",f) for f in ret]
george-weingart
parents:
diff changeset
197 for v in ["/",r'\(',r'\)',r'-',r'\+',r'=',r'{',r'}',r'\[',r'\]',
george-weingart
parents:
diff changeset
198 r',',r'\.',r';',r':',r'\?',r'\<',r'\>',r'\.',r'\,']:
george-weingart
parents:
diff changeset
199 ret = [re.sub(v,"_",f) for f in ret]
george-weingart
parents:
diff changeset
200 for v in ["\|"]:
george-weingart
parents:
diff changeset
201 ret = [re.sub(v,".",f) for f in ret]
george-weingart
parents:
diff changeset
202
george-weingart
parents:
diff changeset
203 ret2 = []
george-weingart
parents:
diff changeset
204 for r in ret:
george-weingart
parents:
diff changeset
205 if r[0] in ['0','1','2','3','4','5','6','7','8','9']:
george-weingart
parents:
diff changeset
206 ret2.append("f_"+r)
george-weingart
parents:
diff changeset
207 else: ret2.append(r)
george-weingart
parents:
diff changeset
208
george-weingart
parents:
diff changeset
209 return ret2
george-weingart
parents:
diff changeset
210
george-weingart
parents:
diff changeset
211
george-weingart
parents:
diff changeset
212 def rename_same_subcl(cl,subcl):
george-weingart
parents:
diff changeset
213 toc = []
george-weingart
parents:
diff changeset
214 for sc in set(subcl):
george-weingart
parents:
diff changeset
215 if len(set([cl[i] for i in range(len(subcl)) if sc == subcl[i]])) > 1:
george-weingart
parents:
diff changeset
216 toc.append(sc)
george-weingart
parents:
diff changeset
217 new_subcl = []
george-weingart
parents:
diff changeset
218 for i,sc in enumerate(subcl):
george-weingart
parents:
diff changeset
219 if sc in toc: new_subcl.append(cl[i]+"_"+sc)
george-weingart
parents:
diff changeset
220 else: new_subcl.append(sc)
george-weingart
parents:
diff changeset
221 return new_subcl
george-weingart
parents:
diff changeset
222
george-weingart
parents:
diff changeset
223 if __name__ == '__main__':
george-weingart
parents:
diff changeset
224 params = read_params(sys.argv)
george-weingart
parents:
diff changeset
225
george-weingart
parents:
diff changeset
226 if type(params['subclass']) is int and int(params['subclass']) < 1:
george-weingart
parents:
diff changeset
227 params['subclass'] = None
george-weingart
parents:
diff changeset
228 if type(params['subject']) is int and int(params['subject']) < 1:
george-weingart
parents:
diff changeset
229 params['subject'] = None
george-weingart
parents:
diff changeset
230 data = read_input_file(sys.argv[1])
george-weingart
parents:
diff changeset
231
george-weingart
parents:
diff changeset
232 if params['feats_dir'] == "c":
george-weingart
parents:
diff changeset
233 data = transpose(data)
george-weingart
parents:
diff changeset
234
george-weingart
parents:
diff changeset
235 ncl = 1
george-weingart
parents:
diff changeset
236 if not params['subclass'] is None: ncl += 1
george-weingart
parents:
diff changeset
237 if not params['subject'] is None: ncl += 1
george-weingart
parents:
diff changeset
238
george-weingart
parents:
diff changeset
239 first_line = zip(*data)[0]
george-weingart
parents:
diff changeset
240
george-weingart
parents:
diff changeset
241 first_line = modify_feature_names(list(first_line))
george-weingart
parents:
diff changeset
242
george-weingart
parents:
diff changeset
243 data = zip( first_line,
george-weingart
parents:
diff changeset
244 *sort_by_cl(zip(*data)[1:],
george-weingart
parents:
diff changeset
245 ncl,
george-weingart
parents:
diff changeset
246 params['class']-1,
george-weingart
parents:
diff changeset
247 params['subclass']-1 if not params['subclass'] is None else None,
george-weingart
parents:
diff changeset
248 params['subject']-1 if not params['subject'] is None else None))
george-weingart
parents:
diff changeset
249 # data.insert(0,first_line)
george-weingart
parents:
diff changeset
250 # data = remove_missing(data,params['missing_p'])
george-weingart
parents:
diff changeset
251 cls = {}
george-weingart
parents:
diff changeset
252
george-weingart
parents:
diff changeset
253 cls_i = [('class',params['class']-1)]
george-weingart
parents:
diff changeset
254 if params['subclass'] > 0: cls_i.append(('subclass',params['subclass']-1))
george-weingart
parents:
diff changeset
255 if params['subject'] > 0: cls_i.append(('subject',params['subject']-1))
george-weingart
parents:
diff changeset
256 cls_i.sort(lambda x, y: -cmp(x[1],y[1]))
george-weingart
parents:
diff changeset
257 for v in cls_i: cls[v[0]] = data.pop(v[1])[1:]
george-weingart
parents:
diff changeset
258 if not params['subclass'] > 0: cls['subclass'] = [str(cl)+"_subcl" for cl in cls['class']]
george-weingart
parents:
diff changeset
259
george-weingart
parents:
diff changeset
260 cls['subclass'] = rename_same_subcl(cls['class'],cls['subclass'])
george-weingart
parents:
diff changeset
261 # if 'subclass' in cls.keys(): cls = group_small_subclasses(cls,params['subcl_min_card'])
george-weingart
parents:
diff changeset
262 class_sl,subclass_sl,class_hierarchy = get_class_slices(zip(*cls.values()))
george-weingart
parents:
diff changeset
263
george-weingart
parents:
diff changeset
264 feats = dict([(d[0],d[1:]) for d in data])
george-weingart
parents:
diff changeset
265
george-weingart
parents:
diff changeset
266 feats = add_missing_levels(feats)
george-weingart
parents:
diff changeset
267
george-weingart
parents:
diff changeset
268 feats = numerical_values(feats,params['norm_v'])
george-weingart
parents:
diff changeset
269 out = {}
george-weingart
parents:
diff changeset
270 out['feats'] = feats
george-weingart
parents:
diff changeset
271 out['norm'] = params['norm_v']
george-weingart
parents:
diff changeset
272 out['cls'] = cls
george-weingart
parents:
diff changeset
273 out['class_sl'] = class_sl
george-weingart
parents:
diff changeset
274 out['subclass_sl'] = subclass_sl
george-weingart
parents:
diff changeset
275 out['class_hierarchy'] = class_hierarchy
george-weingart
parents:
diff changeset
276
george-weingart
parents:
diff changeset
277 if params['output_table']:
george-weingart
parents:
diff changeset
278 with open( params['output_table'], "w") as outf:
george-weingart
parents:
diff changeset
279 if 'class' in cls: outf.write( "\t".join(list(["class"])+list(cls['class'])) + "\n" )
george-weingart
parents:
diff changeset
280 if 'subclass' in cls: outf.write( "\t".join(list(["subclass"])+list(cls['subclass'])) + "\n" )
george-weingart
parents:
diff changeset
281 if 'subject' in cls: outf.write( "\t".join(list(["subject"])+list(cls['subject'])) + "\n" )
george-weingart
parents:
diff changeset
282 for k,v in out['feats'].items(): outf.write( "\t".join([k]+[str(vv) for vv in v]) + "\n" )
george-weingart
parents:
diff changeset
283
george-weingart
parents:
diff changeset
284 with open(params['output_file'], 'wb') as back_file:
george-weingart
parents:
diff changeset
285 pickle.dump(out,back_file)
george-weingart
parents:
diff changeset
286