0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 import sys,os,argparse,pickle,re
|
|
4
|
|
5 def read_input_file(inp_file):
|
|
6 with open(inp_file) as inp:
|
|
7 return [[v.strip() for v in line.strip().split("\t")] for line in inp.readlines()]
|
|
8
|
|
9 def transpose(data):
|
|
10 return zip(*data)
|
|
11
|
|
12 def read_params(args):
|
|
13 parser = argparse.ArgumentParser(description='LEfSe formatting modules')
|
|
14 parser.add_argument('input_file', metavar='INPUT_FILE', type=str, help="the input file, feature hierarchical level can be specified with | or . and those symbols must not be present for other reasons in the input file.")
|
|
15 parser.add_argument('output_file', metavar='OUTPUT_FILE', type=str,
|
|
16 help="the output file containing the data for LEfSe")
|
|
17 parser.add_argument('--output_table', type=str, required=False, default="",
|
|
18 help="the formatted table in txt format")
|
|
19 parser.add_argument('-f',dest="feats_dir", choices=["c","r"], type=str, default="r",
|
|
20 help="set whether the features are on rows (default) or on columns")
|
|
21 parser.add_argument('-c',dest="class", metavar="[1..n_feats]", type=int, default=1,
|
|
22 help="set which feature use as class (default 1)")
|
|
23 parser.add_argument('-s',dest="subclass", metavar="[1..n_feats]", type=int, default=None,
|
|
24 help="set which feature use as subclass (default -1 meaning no subclass)")
|
|
25 parser.add_argument('-o',dest="norm_v", metavar="float", type=float, default=-1.0,
|
|
26 help="set the normalization value (default -1.0 meaning no normalization)")
|
|
27 parser.add_argument('-u',dest="subject", metavar="[1..n_feats]", type=int, default=None,
|
|
28 help="set which feature use as subject (default -1 meaning no subject)")
|
|
29 parser.add_argument('-m',dest="missing_p", choices=["f","s"], type=str, default="d",
|
|
30 help="set the policy to adopt with missin values: f removes the features with missing values, s removes samples with missing values (default f)")
|
|
31 parser.add_argument('-n',dest="subcl_min_card", metavar="int", type=int, default=10,
|
|
32 help="set the minimum cardinality of each subclass (subclasses with low cardinalities will be grouped together, if the cardinality is still low, no pairwise comparison will be performed with them)")
|
|
33
|
|
34 args = parser.parse_args()
|
|
35
|
|
36 return vars(args)
|
|
37
|
|
38 def remove_missing(data,roc):
|
|
39 if roc == "c": data = transpose(data)
|
|
40 max_len = max([len(r) for r in data])
|
|
41 to_rem = []
|
|
42 for i,r in enumerate(data):
|
|
43 if len([v for v in r if not( v == "" or v.isspace())]) < max_len: to_rem.append(i)
|
|
44 if len(to_rem):
|
|
45 for i in to_rem.reverse():
|
|
46 data.pop(i)
|
|
47 if roc == "c": return transpose(data)
|
|
48 return data
|
|
49
|
|
50
|
|
51 def sort_by_cl(data,n,c,s,u):
|
|
52 def sort_lines1(a,b):
|
|
53 return int(a[c] > b[c])*2-1
|
|
54 def sort_lines2u(a,b):
|
|
55 if a[c] != b[c]: return int(a[c] > b[c])*2-1
|
|
56 return int(a[u] > b[u])*2-1
|
|
57 def sort_lines2s(a,b):
|
|
58 if a[c] != b[c]: return int(a[c] > b[c])*2-1
|
|
59 return int(a[s] > b[s])*2-1
|
|
60 def sort_lines3(a,b):
|
|
61 if a[c] != b[c]: return int(a[c] > b[c])*2-1
|
|
62 if a[s] != b[s]: return int(a[s] > b[s])*2-1
|
|
63 return int(a[u] > b[u])*2-1
|
|
64 if n == 3: data.sort(sort_lines3)
|
|
65 if n == 2:
|
|
66 if s is None:
|
|
67 data.sort(sort_lines2u)
|
|
68 else:
|
|
69 data.sort(sort_lines2s)
|
|
70 if n == 1: data.sort(sort_lines1)
|
|
71 return data
|
|
72
|
|
73 def group_small_subclasses(cls,min_subcl):
|
|
74 last = ""
|
|
75 n = 0
|
|
76 repl = []
|
|
77 dd = [list(cls['class']),list(cls['subclass'])]
|
|
78 for d in dd:
|
|
79 if d[1] != last:
|
|
80 if n < min_subcl and last != "":
|
|
81 repl.append(d[1])
|
|
82 last = d[1]
|
|
83 n = 1
|
|
84 for i,d in enumerate(dd):
|
|
85 if d[1] in repl: dd[i][1] = "other"
|
|
86 dd[i][1] = str(dd[i][0])+"_"+str(dd[i][1])
|
|
87 cls['class'] = dd[0]
|
|
88 cls['subclass'] = dd[1]
|
|
89 return cls
|
|
90
|
|
91 def get_class_slices(data):
|
|
92 previous_class = data[0][0]
|
|
93 previous_subclass = data[0][1]
|
|
94 subclass_slices = []
|
|
95 class_slices = []
|
|
96 last_cl = 0
|
|
97 last_subcl = 0
|
|
98 class_hierarchy = []
|
|
99 subcls = []
|
|
100 for i,d in enumerate(data):
|
|
101 if d[1] != previous_subclass:
|
|
102 subclass_slices.append((previous_subclass,(last_subcl,i)))
|
|
103 last_subcl = i
|
|
104 subcls.append(previous_subclass)
|
|
105 if d[0] != previous_class:
|
|
106 class_slices.append((previous_class,(last_cl,i)))
|
|
107 class_hierarchy.append((previous_class,subcls))
|
|
108 subcls = []
|
|
109 last_cl = i
|
|
110 previous_subclass = d[1]
|
|
111 previous_class = d[0]
|
|
112 subclass_slices.append((previous_subclass,(last_subcl,i+1)))
|
|
113 subcls.append(previous_subclass)
|
|
114 class_slices.append((previous_class,(last_cl,i+1)))
|
|
115 class_hierarchy.append((previous_class,subcls))
|
|
116 return dict(class_slices), dict(subclass_slices), dict(class_hierarchy)
|
|
117
|
|
118 def numerical_values(feats,norm):
|
|
119 mm = []
|
|
120 for k,v in feats.items():
|
|
121 feats[k] = [float(val) for val in v]
|
|
122 if norm < 0.0: return feats
|
|
123 tr = zip(*(feats.values()))
|
|
124 mul = []
|
|
125 fk = feats.keys()
|
|
126 hie = True if sum([k.count(".") for k in fk]) > len(fk) else False
|
|
127 for i in range(len(feats.values()[0])):
|
|
128 if hie: mul.append(sum([t for j,t in enumerate(tr[i]) if fk[j].count(".") < 1 ]))
|
|
129 else: mul.append(sum(tr[i]))
|
|
130 if hie and sum(mul) == 0:
|
|
131 mul = []
|
|
132 for i in range(len(feats.values()[0])):
|
|
133 mul.append(sum(tr[i]))
|
|
134 for i,m in enumerate(mul):
|
|
135 if m == 0: mul[i] = 0.0
|
|
136 else: mul[i] = float(norm) / m
|
|
137 for k,v in feats.items():
|
|
138 feats[k] = [val*mul[i] for i,val in enumerate(v)]
|
|
139 return feats
|
|
140
|
|
141 def add_missing_levels2(ff):
|
|
142
|
|
143 if sum( [f.count(".") for f in ff] ) < 1: return ff
|
|
144
|
|
145 dn = {}
|
|
146
|
|
147 added = True
|
|
148 while added:
|
|
149 added = False
|
|
150 for f in ff:
|
|
151 lev = f.count(".")
|
|
152 if lev == 0: continue
|
|
153 if lev not in dn: dn[lev] = [f]
|
|
154 else: dn[lev].append(f)
|
|
155 for fn in sorted(dn,reverse=True):
|
|
156 for f in dn[fn]:
|
|
157 fc = ".".join(f.split('.')[:-1])
|
|
158 if fc not in ff:
|
|
159 ab_all = [ff[fg] for fg in ff if (fg.count(".") == 0 and fg == fc) or (fg.count(".") > 0 and fc == ".".join(fg.split('.')[:-1]))]
|
|
160 ab =[]
|
|
161 for l in [f for f in zip(*ab_all)]:
|
|
162 ab.append(sum([float(ll) for ll in l]))
|
|
163 ff[fc] = ab
|
|
164 added = True
|
|
165 if added:
|
|
166 break
|
|
167
|
|
168 return ff
|
|
169
|
|
170
|
|
171 def add_missing_levels(ff):
|
|
172 if sum( [f.count(".") for f in ff] ) < 1: return ff
|
|
173
|
|
174 clades2leaves = {}
|
|
175 for f in ff:
|
|
176 fs = f.split(".")
|
|
177 if len(fs) < 2:
|
|
178 continue
|
|
179 for l in range(len(fs)):
|
|
180 n = ".".join( fs[:l] )
|
|
181 if n in clades2leaves:
|
|
182 clades2leaves[n].append( f )
|
|
183 else:
|
|
184 clades2leaves[n] = [f]
|
|
185 for k,v in clades2leaves.items():
|
|
186 if k and k not in ff:
|
|
187 ff[k] = [sum(a) for a in zip(*[[float(fn) for fn in ff[vv]] for vv in v])]
|
|
188 return ff
|
|
189
|
|
190
|
|
191
|
|
192 def modify_feature_names(fn):
|
|
193 ret = fn
|
|
194
|
|
195 for v in [' ',r'\$',r'\@',r'#',r'%',r'\^',r'\&',r'\*',r'\"',r'\'']:
|
|
196 ret = [re.sub(v,"",f) for f in ret]
|
|
197 for v in ["/",r'\(',r'\)',r'-',r'\+',r'=',r'{',r'}',r'\[',r'\]',
|
|
198 r',',r'\.',r';',r':',r'\?',r'\<',r'\>',r'\.',r'\,']:
|
|
199 ret = [re.sub(v,"_",f) for f in ret]
|
|
200 for v in ["\|"]:
|
|
201 ret = [re.sub(v,".",f) for f in ret]
|
|
202
|
|
203 ret2 = []
|
|
204 for r in ret:
|
|
205 if r[0] in ['0','1','2','3','4','5','6','7','8','9']:
|
|
206 ret2.append("f_"+r)
|
|
207 else: ret2.append(r)
|
|
208
|
|
209 return ret2
|
|
210
|
|
211
|
|
212 def rename_same_subcl(cl,subcl):
|
|
213 toc = []
|
|
214 for sc in set(subcl):
|
|
215 if len(set([cl[i] for i in range(len(subcl)) if sc == subcl[i]])) > 1:
|
|
216 toc.append(sc)
|
|
217 new_subcl = []
|
|
218 for i,sc in enumerate(subcl):
|
|
219 if sc in toc: new_subcl.append(cl[i]+"_"+sc)
|
|
220 else: new_subcl.append(sc)
|
|
221 return new_subcl
|
|
222
|
|
223 if __name__ == '__main__':
|
|
224 params = read_params(sys.argv)
|
|
225
|
|
226 if type(params['subclass']) is int and int(params['subclass']) < 1:
|
|
227 params['subclass'] = None
|
|
228 if type(params['subject']) is int and int(params['subject']) < 1:
|
|
229 params['subject'] = None
|
|
230 data = read_input_file(sys.argv[1])
|
|
231
|
|
232 if params['feats_dir'] == "c":
|
|
233 data = transpose(data)
|
|
234
|
|
235 ncl = 1
|
|
236 if not params['subclass'] is None: ncl += 1
|
|
237 if not params['subject'] is None: ncl += 1
|
|
238
|
|
239 first_line = zip(*data)[0]
|
|
240
|
|
241 first_line = modify_feature_names(list(first_line))
|
|
242
|
|
243 data = zip( first_line,
|
|
244 *sort_by_cl(zip(*data)[1:],
|
|
245 ncl,
|
|
246 params['class']-1,
|
|
247 params['subclass']-1 if not params['subclass'] is None else None,
|
|
248 params['subject']-1 if not params['subject'] is None else None))
|
|
249 # data.insert(0,first_line)
|
|
250 # data = remove_missing(data,params['missing_p'])
|
|
251 cls = {}
|
|
252
|
|
253 cls_i = [('class',params['class']-1)]
|
|
254 if params['subclass'] > 0: cls_i.append(('subclass',params['subclass']-1))
|
|
255 if params['subject'] > 0: cls_i.append(('subject',params['subject']-1))
|
|
256 cls_i.sort(lambda x, y: -cmp(x[1],y[1]))
|
|
257 for v in cls_i: cls[v[0]] = data.pop(v[1])[1:]
|
|
258 if not params['subclass'] > 0: cls['subclass'] = [str(cl)+"_subcl" for cl in cls['class']]
|
|
259
|
|
260 cls['subclass'] = rename_same_subcl(cls['class'],cls['subclass'])
|
|
261 # if 'subclass' in cls.keys(): cls = group_small_subclasses(cls,params['subcl_min_card'])
|
|
262 class_sl,subclass_sl,class_hierarchy = get_class_slices(zip(*cls.values()))
|
|
263
|
|
264 feats = dict([(d[0],d[1:]) for d in data])
|
|
265
|
|
266 feats = add_missing_levels(feats)
|
|
267
|
|
268 feats = numerical_values(feats,params['norm_v'])
|
|
269 out = {}
|
|
270 out['feats'] = feats
|
|
271 out['norm'] = params['norm_v']
|
|
272 out['cls'] = cls
|
|
273 out['class_sl'] = class_sl
|
|
274 out['subclass_sl'] = subclass_sl
|
|
275 out['class_hierarchy'] = class_hierarchy
|
|
276
|
|
277 if params['output_table']:
|
|
278 with open( params['output_table'], "w") as outf:
|
|
279 if 'class' in cls: outf.write( "\t".join(list(["class"])+list(cls['class'])) + "\n" )
|
|
280 if 'subclass' in cls: outf.write( "\t".join(list(["subclass"])+list(cls['subclass'])) + "\n" )
|
|
281 if 'subject' in cls: outf.write( "\t".join(list(["subject"])+list(cls['subject'])) + "\n" )
|
|
282 for k,v in out['feats'].items(): outf.write( "\t".join([k]+[str(vv) for vv in v]) + "\n" )
|
|
283
|
|
284 with open(params['output_file'], 'wb') as back_file:
|
|
285 pickle.dump(out,back_file)
|
|
286
|