Mercurial > repos > george-weingart > lefse
comparison format_input.py @ 0:e7cd19afda2e draft
Lefse
author | george-weingart |
---|---|
date | Tue, 13 May 2014 21:57:00 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e7cd19afda2e |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import sys,os,argparse,pickle,re | |
4 | |
5 def read_input_file(inp_file): | |
6 with open(inp_file) as inp: | |
7 return [[v.strip() for v in line.strip().split("\t")] for line in inp.readlines()] | |
8 | |
9 def transpose(data): | |
10 return zip(*data) | |
11 | |
12 def read_params(args): | |
13 parser = argparse.ArgumentParser(description='LEfSe formatting modules') | |
14 parser.add_argument('input_file', metavar='INPUT_FILE', type=str, help="the input file, feature hierarchical level can be specified with | or . and those symbols must not be present for other reasons in the input file.") | |
15 parser.add_argument('output_file', metavar='OUTPUT_FILE', type=str, | |
16 help="the output file containing the data for LEfSe") | |
17 parser.add_argument('--output_table', type=str, required=False, default="", | |
18 help="the formatted table in txt format") | |
19 parser.add_argument('-f',dest="feats_dir", choices=["c","r"], type=str, default="r", | |
20 help="set whether the features are on rows (default) or on columns") | |
21 parser.add_argument('-c',dest="class", metavar="[1..n_feats]", type=int, default=1, | |
22 help="set which feature use as class (default 1)") | |
23 parser.add_argument('-s',dest="subclass", metavar="[1..n_feats]", type=int, default=None, | |
24 help="set which feature use as subclass (default -1 meaning no subclass)") | |
25 parser.add_argument('-o',dest="norm_v", metavar="float", type=float, default=-1.0, | |
26 help="set the normalization value (default -1.0 meaning no normalization)") | |
27 parser.add_argument('-u',dest="subject", metavar="[1..n_feats]", type=int, default=None, | |
28 help="set which feature use as subject (default -1 meaning no subject)") | |
29 parser.add_argument('-m',dest="missing_p", choices=["f","s"], type=str, default="d", | |
30 help="set the policy to adopt with missin values: f removes the features with missing values, s removes samples with missing values (default f)") | |
31 parser.add_argument('-n',dest="subcl_min_card", metavar="int", type=int, default=10, | |
32 help="set the minimum cardinality of each subclass (subclasses with low cardinalities will be grouped together, if the cardinality is still low, no pairwise comparison will be performed with them)") | |
33 | |
34 args = parser.parse_args() | |
35 | |
36 return vars(args) | |
37 | |
38 def remove_missing(data,roc): | |
39 if roc == "c": data = transpose(data) | |
40 max_len = max([len(r) for r in data]) | |
41 to_rem = [] | |
42 for i,r in enumerate(data): | |
43 if len([v for v in r if not( v == "" or v.isspace())]) < max_len: to_rem.append(i) | |
44 if len(to_rem): | |
45 for i in to_rem.reverse(): | |
46 data.pop(i) | |
47 if roc == "c": return transpose(data) | |
48 return data | |
49 | |
50 | |
51 def sort_by_cl(data,n,c,s,u): | |
52 def sort_lines1(a,b): | |
53 return int(a[c] > b[c])*2-1 | |
54 def sort_lines2u(a,b): | |
55 if a[c] != b[c]: return int(a[c] > b[c])*2-1 | |
56 return int(a[u] > b[u])*2-1 | |
57 def sort_lines2s(a,b): | |
58 if a[c] != b[c]: return int(a[c] > b[c])*2-1 | |
59 return int(a[s] > b[s])*2-1 | |
60 def sort_lines3(a,b): | |
61 if a[c] != b[c]: return int(a[c] > b[c])*2-1 | |
62 if a[s] != b[s]: return int(a[s] > b[s])*2-1 | |
63 return int(a[u] > b[u])*2-1 | |
64 if n == 3: data.sort(sort_lines3) | |
65 if n == 2: | |
66 if s is None: | |
67 data.sort(sort_lines2u) | |
68 else: | |
69 data.sort(sort_lines2s) | |
70 if n == 1: data.sort(sort_lines1) | |
71 return data | |
72 | |
73 def group_small_subclasses(cls,min_subcl): | |
74 last = "" | |
75 n = 0 | |
76 repl = [] | |
77 dd = [list(cls['class']),list(cls['subclass'])] | |
78 for d in dd: | |
79 if d[1] != last: | |
80 if n < min_subcl and last != "": | |
81 repl.append(d[1]) | |
82 last = d[1] | |
83 n = 1 | |
84 for i,d in enumerate(dd): | |
85 if d[1] in repl: dd[i][1] = "other" | |
86 dd[i][1] = str(dd[i][0])+"_"+str(dd[i][1]) | |
87 cls['class'] = dd[0] | |
88 cls['subclass'] = dd[1] | |
89 return cls | |
90 | |
91 def get_class_slices(data): | |
92 previous_class = data[0][0] | |
93 previous_subclass = data[0][1] | |
94 subclass_slices = [] | |
95 class_slices = [] | |
96 last_cl = 0 | |
97 last_subcl = 0 | |
98 class_hierarchy = [] | |
99 subcls = [] | |
100 for i,d in enumerate(data): | |
101 if d[1] != previous_subclass: | |
102 subclass_slices.append((previous_subclass,(last_subcl,i))) | |
103 last_subcl = i | |
104 subcls.append(previous_subclass) | |
105 if d[0] != previous_class: | |
106 class_slices.append((previous_class,(last_cl,i))) | |
107 class_hierarchy.append((previous_class,subcls)) | |
108 subcls = [] | |
109 last_cl = i | |
110 previous_subclass = d[1] | |
111 previous_class = d[0] | |
112 subclass_slices.append((previous_subclass,(last_subcl,i+1))) | |
113 subcls.append(previous_subclass) | |
114 class_slices.append((previous_class,(last_cl,i+1))) | |
115 class_hierarchy.append((previous_class,subcls)) | |
116 return dict(class_slices), dict(subclass_slices), dict(class_hierarchy) | |
117 | |
118 def numerical_values(feats,norm): | |
119 mm = [] | |
120 for k,v in feats.items(): | |
121 feats[k] = [float(val) for val in v] | |
122 if norm < 0.0: return feats | |
123 tr = zip(*(feats.values())) | |
124 mul = [] | |
125 fk = feats.keys() | |
126 hie = True if sum([k.count(".") for k in fk]) > len(fk) else False | |
127 for i in range(len(feats.values()[0])): | |
128 if hie: mul.append(sum([t for j,t in enumerate(tr[i]) if fk[j].count(".") < 1 ])) | |
129 else: mul.append(sum(tr[i])) | |
130 if hie and sum(mul) == 0: | |
131 mul = [] | |
132 for i in range(len(feats.values()[0])): | |
133 mul.append(sum(tr[i])) | |
134 for i,m in enumerate(mul): | |
135 if m == 0: mul[i] = 0.0 | |
136 else: mul[i] = float(norm) / m | |
137 for k,v in feats.items(): | |
138 feats[k] = [val*mul[i] for i,val in enumerate(v)] | |
139 return feats | |
140 | |
141 def add_missing_levels2(ff): | |
142 | |
143 if sum( [f.count(".") for f in ff] ) < 1: return ff | |
144 | |
145 dn = {} | |
146 | |
147 added = True | |
148 while added: | |
149 added = False | |
150 for f in ff: | |
151 lev = f.count(".") | |
152 if lev == 0: continue | |
153 if lev not in dn: dn[lev] = [f] | |
154 else: dn[lev].append(f) | |
155 for fn in sorted(dn,reverse=True): | |
156 for f in dn[fn]: | |
157 fc = ".".join(f.split('.')[:-1]) | |
158 if fc not in ff: | |
159 ab_all = [ff[fg] for fg in ff if (fg.count(".") == 0 and fg == fc) or (fg.count(".") > 0 and fc == ".".join(fg.split('.')[:-1]))] | |
160 ab =[] | |
161 for l in [f for f in zip(*ab_all)]: | |
162 ab.append(sum([float(ll) for ll in l])) | |
163 ff[fc] = ab | |
164 added = True | |
165 if added: | |
166 break | |
167 | |
168 return ff | |
169 | |
170 | |
171 def add_missing_levels(ff): | |
172 if sum( [f.count(".") for f in ff] ) < 1: return ff | |
173 | |
174 clades2leaves = {} | |
175 for f in ff: | |
176 fs = f.split(".") | |
177 if len(fs) < 2: | |
178 continue | |
179 for l in range(len(fs)): | |
180 n = ".".join( fs[:l] ) | |
181 if n in clades2leaves: | |
182 clades2leaves[n].append( f ) | |
183 else: | |
184 clades2leaves[n] = [f] | |
185 for k,v in clades2leaves.items(): | |
186 if k and k not in ff: | |
187 ff[k] = [sum(a) for a in zip(*[[float(fn) for fn in ff[vv]] for vv in v])] | |
188 return ff | |
189 | |
190 | |
191 | |
192 def modify_feature_names(fn): | |
193 ret = fn | |
194 | |
195 for v in [' ',r'\$',r'\@',r'#',r'%',r'\^',r'\&',r'\*',r'\"',r'\'']: | |
196 ret = [re.sub(v,"",f) for f in ret] | |
197 for v in ["/",r'\(',r'\)',r'-',r'\+',r'=',r'{',r'}',r'\[',r'\]', | |
198 r',',r'\.',r';',r':',r'\?',r'\<',r'\>',r'\.',r'\,']: | |
199 ret = [re.sub(v,"_",f) for f in ret] | |
200 for v in ["\|"]: | |
201 ret = [re.sub(v,".",f) for f in ret] | |
202 | |
203 ret2 = [] | |
204 for r in ret: | |
205 if r[0] in ['0','1','2','3','4','5','6','7','8','9']: | |
206 ret2.append("f_"+r) | |
207 else: ret2.append(r) | |
208 | |
209 return ret2 | |
210 | |
211 | |
212 def rename_same_subcl(cl,subcl): | |
213 toc = [] | |
214 for sc in set(subcl): | |
215 if len(set([cl[i] for i in range(len(subcl)) if sc == subcl[i]])) > 1: | |
216 toc.append(sc) | |
217 new_subcl = [] | |
218 for i,sc in enumerate(subcl): | |
219 if sc in toc: new_subcl.append(cl[i]+"_"+sc) | |
220 else: new_subcl.append(sc) | |
221 return new_subcl | |
222 | |
223 if __name__ == '__main__': | |
224 params = read_params(sys.argv) | |
225 | |
226 if type(params['subclass']) is int and int(params['subclass']) < 1: | |
227 params['subclass'] = None | |
228 if type(params['subject']) is int and int(params['subject']) < 1: | |
229 params['subject'] = None | |
230 data = read_input_file(sys.argv[1]) | |
231 | |
232 if params['feats_dir'] == "c": | |
233 data = transpose(data) | |
234 | |
235 ncl = 1 | |
236 if not params['subclass'] is None: ncl += 1 | |
237 if not params['subject'] is None: ncl += 1 | |
238 | |
239 first_line = zip(*data)[0] | |
240 | |
241 first_line = modify_feature_names(list(first_line)) | |
242 | |
243 data = zip( first_line, | |
244 *sort_by_cl(zip(*data)[1:], | |
245 ncl, | |
246 params['class']-1, | |
247 params['subclass']-1 if not params['subclass'] is None else None, | |
248 params['subject']-1 if not params['subject'] is None else None)) | |
249 # data.insert(0,first_line) | |
250 # data = remove_missing(data,params['missing_p']) | |
251 cls = {} | |
252 | |
253 cls_i = [('class',params['class']-1)] | |
254 if params['subclass'] > 0: cls_i.append(('subclass',params['subclass']-1)) | |
255 if params['subject'] > 0: cls_i.append(('subject',params['subject']-1)) | |
256 cls_i.sort(lambda x, y: -cmp(x[1],y[1])) | |
257 for v in cls_i: cls[v[0]] = data.pop(v[1])[1:] | |
258 if not params['subclass'] > 0: cls['subclass'] = [str(cl)+"_subcl" for cl in cls['class']] | |
259 | |
260 cls['subclass'] = rename_same_subcl(cls['class'],cls['subclass']) | |
261 # if 'subclass' in cls.keys(): cls = group_small_subclasses(cls,params['subcl_min_card']) | |
262 class_sl,subclass_sl,class_hierarchy = get_class_slices(zip(*cls.values())) | |
263 | |
264 feats = dict([(d[0],d[1:]) for d in data]) | |
265 | |
266 feats = add_missing_levels(feats) | |
267 | |
268 feats = numerical_values(feats,params['norm_v']) | |
269 out = {} | |
270 out['feats'] = feats | |
271 out['norm'] = params['norm_v'] | |
272 out['cls'] = cls | |
273 out['class_sl'] = class_sl | |
274 out['subclass_sl'] = subclass_sl | |
275 out['class_hierarchy'] = class_hierarchy | |
276 | |
277 if params['output_table']: | |
278 with open( params['output_table'], "w") as outf: | |
279 if 'class' in cls: outf.write( "\t".join(list(["class"])+list(cls['class'])) + "\n" ) | |
280 if 'subclass' in cls: outf.write( "\t".join(list(["subclass"])+list(cls['subclass'])) + "\n" ) | |
281 if 'subject' in cls: outf.write( "\t".join(list(["subject"])+list(cls['subject'])) + "\n" ) | |
282 for k,v in out['feats'].items(): outf.write( "\t".join([k]+[str(vv) for vv in v]) + "\n" ) | |
283 | |
284 with open(params['output_file'], 'wb') as back_file: | |
285 pickle.dump(out,back_file) | |
286 |