comparison format_input.py @ 2:a31c10fe09c8 draft default tip

Fixed bug due to numerical approximation after normalization affecting root-level clades (e.g. "Bacteria" or "Archaea")
author george-weingart
date Tue, 07 Jul 2015 13:52:29 -0400
parents
children
comparison
equal deleted inserted replaced
1:db64b6287cd6 2:a31c10fe09c8
1 #!/usr/bin/env python
2
3 import sys,os,argparse,pickle,re,numpy
4
5
6
7
8 #***************************************************************************************************************
9 #* Log of change *
10 #* January 16, 2014 - George Weingart - george.weingart@gmail.com *
11 #* *
12 #* biom Support *
13 #* Modified the program to enable it to accept biom files as input *
14 #* *
15 #* Added two optional input parameters: *
16 #* 1. biom_c is the name of the biom metadata to be used as class *
17 #* 2. biom_s is the name of the biom metadata to be used as subclass *
18 #* class and subclass are used in the same context as the original *
19 #* parameters class and subclass *
20 #* These parameters are totally optional, the default is the program *
21 #* chooses as class the first metadata received from the conversion *
22 #* of the biom file into a sequential (pcl) file as generated by *
23 #* breadcrumbs, and similarly, the second metadata is selected as *
24 #* subclass. *
25 #* The syntax or logic for the original non-biom case was NOT changed. *
26 #* *
27 #* <******************* IMPORTANT NOTE *************************> *
28 #* The biom case requires breadcrumbs and therefore there is a *
29 #* a conditional import of the breadcrumbs modules *
30 #* If the User uses a biom input and breadcrumbs is not detected, *
31 #* the run is abnormally ended *
32 #* breadcrumbs itself needs a biom environment, so if the immport *
33 #* of biom in breadcrumbs fails, the run is also abnormally
34 #* ended (Only if the input file was biom) *
35 #* *
36 #* USAGE EXAMPLES *
37 #* -------------- *
38 #* Case #1: Using a sequential file as input (Old version - did not change *
39 #* ./format_input.py hmp_aerobiosis_small.txt hmp_aerobiosis_small.in -c 1 -s 2 -u 3 -o 1000000 *
40 #* Case #2: Using a biom file as input *
41 #* ./format_input.py hmp_aerobiosis_small.biom hmp_aerobiosis_small.in -o 1000000 *
42 #* Case #3: Using a biom file as input and override the class and subclass *
43 #* ./format_input.py lefse.biom hmp_aerobiosis_small.in -biom_c oxygen_availability -biom_s body_site -o 1000000
44 #* *
45 #***************************************************************************************************************
46
47 def read_input_file(inp_file, CommonArea):
48
49 if inp_file.endswith('.biom'): #* If the file format is biom:
50 CommonArea = biom_processing(inp_file) #* Process in biom format
51 return CommonArea #* And return the CommonArea
52
53 with open(inp_file) as inp:
54 CommonArea['ReturnedData'] = [[v.strip() for v in line.strip().split("\t")] for line in inp.readlines()]
55 return CommonArea
56
57 def transpose(data):
58 return zip(*data)
59
60 def read_params(args):
61 parser = argparse.ArgumentParser(description='LEfSe formatting modules')
62 parser.add_argument('input_file', metavar='INPUT_FILE', type=str, help="the input file, feature hierarchical level can be specified with | or . and those symbols must not be present for other reasons in the input file.")
63 parser.add_argument('output_file', metavar='OUTPUT_FILE', type=str,
64 help="the output file containing the data for LEfSe")
65 parser.add_argument('--output_table', type=str, required=False, default="",
66 help="the formatted table in txt format")
67 parser.add_argument('-f',dest="feats_dir", choices=["c","r"], type=str, default="r",
68 help="set whether the features are on rows (default) or on columns")
69 parser.add_argument('-c',dest="class", metavar="[1..n_feats]", type=int, default=1,
70 help="set which feature use as class (default 1)")
71 parser.add_argument('-s',dest="subclass", metavar="[1..n_feats]", type=int, default=None,
72 help="set which feature use as subclass (default -1 meaning no subclass)")
73 parser.add_argument('-o',dest="norm_v", metavar="float", type=float, default=-1.0,
74 help="set the normalization value (default -1.0 meaning no normalization)")
75 parser.add_argument('-u',dest="subject", metavar="[1..n_feats]", type=int, default=None,
76 help="set which feature use as subject (default -1 meaning no subject)")
77 parser.add_argument('-m',dest="missing_p", choices=["f","s"], type=str, default="d",
78 help="set the policy to adopt with missin values: f removes the features with missing values, s removes samples with missing values (default f)")
79 parser.add_argument('-n',dest="subcl_min_card", metavar="int", type=int, default=10,
80 help="set the minimum cardinality of each subclass (subclasses with low cardinalities will be grouped together, if the cardinality is still low, no pairwise comparison will be performed with them)")
81
82 parser.add_argument('-biom_c',dest="biom_class", type=str,
83 help="For biom input files: Set which feature use as class ")
84 parser.add_argument('-biom_s',dest="biom_subclass", type=str,
85 help="For biom input files: set which feature use as subclass ")
86
87 args = parser.parse_args()
88
89 return vars(args)
90
91 def remove_missing(data,roc):
92 if roc == "c": data = transpose(data)
93 max_len = max([len(r) for r in data])
94 to_rem = []
95 for i,r in enumerate(data):
96 if len([v for v in r if not( v == "" or v.isspace())]) < max_len: to_rem.append(i)
97 if len(to_rem):
98 for i in to_rem.reverse():
99 data.pop(i)
100 if roc == "c": return transpose(data)
101 return data
102
103
104 def sort_by_cl(data,n,c,s,u):
105 def sort_lines1(a,b):
106 return int(a[c] > b[c])*2-1
107 def sort_lines2u(a,b):
108 if a[c] != b[c]: return int(a[c] > b[c])*2-1
109 return int(a[u] > b[u])*2-1
110 def sort_lines2s(a,b):
111 if a[c] != b[c]: return int(a[c] > b[c])*2-1
112 return int(a[s] > b[s])*2-1
113 def sort_lines3(a,b):
114 if a[c] != b[c]: return int(a[c] > b[c])*2-1
115 if a[s] != b[s]: return int(a[s] > b[s])*2-1
116 return int(a[u] > b[u])*2-1
117 if n == 3: data.sort(sort_lines3)
118 if n == 2:
119 if s is None:
120 data.sort(sort_lines2u)
121 else:
122 data.sort(sort_lines2s)
123 if n == 1: data.sort(sort_lines1)
124 return data
125
126 def group_small_subclasses(cls,min_subcl):
127 last = ""
128 n = 0
129 repl = []
130 dd = [list(cls['class']),list(cls['subclass'])]
131 for d in dd:
132 if d[1] != last:
133 if n < min_subcl and last != "":
134 repl.append(d[1])
135 last = d[1]
136 n = 1
137 for i,d in enumerate(dd):
138 if d[1] in repl: dd[i][1] = "other"
139 dd[i][1] = str(dd[i][0])+"_"+str(dd[i][1])
140 cls['class'] = dd[0]
141 cls['subclass'] = dd[1]
142 return cls
143
144 def get_class_slices(data):
145 previous_class = data[0][0]
146 previous_subclass = data[0][1]
147 subclass_slices = []
148 class_slices = []
149 last_cl = 0
150 last_subcl = 0
151 class_hierarchy = []
152 subcls = []
153 for i,d in enumerate(data):
154 if d[1] != previous_subclass:
155 subclass_slices.append((previous_subclass,(last_subcl,i)))
156 last_subcl = i
157 subcls.append(previous_subclass)
158 if d[0] != previous_class:
159 class_slices.append((previous_class,(last_cl,i)))
160 class_hierarchy.append((previous_class,subcls))
161 subcls = []
162 last_cl = i
163 previous_subclass = d[1]
164 previous_class = d[0]
165 subclass_slices.append((previous_subclass,(last_subcl,i+1)))
166 subcls.append(previous_subclass)
167 class_slices.append((previous_class,(last_cl,i+1)))
168 class_hierarchy.append((previous_class,subcls))
169 return dict(class_slices), dict(subclass_slices), dict(class_hierarchy)
170
171 def numerical_values(feats,norm):
172 mm = []
173 for k,v in feats.items():
174 feats[k] = [float(val) for val in v]
175 if norm < 0.0: return feats
176 tr = zip(*(feats.values()))
177 mul = []
178 fk = feats.keys()
179 hie = True if sum([k.count(".") for k in fk]) > len(fk) else False
180 for i in range(len(feats.values()[0])):
181 if hie: mul.append(sum([t for j,t in enumerate(tr[i]) if fk[j].count(".") < 1 ]))
182 else: mul.append(sum(tr[i]))
183 if hie and sum(mul) == 0:
184 mul = []
185 for i in range(len(feats.values()[0])):
186 mul.append(sum(tr[i]))
187 for i,m in enumerate(mul):
188 if m == 0: mul[i] = 0.0
189 else: mul[i] = float(norm) / m
190 for k,v in feats.items():
191 feats[k] = [val*mul[i] for i,val in enumerate(v)]
192 if numpy.mean(feats[k]) and (numpy.std(feats[k])/numpy.mean(feats[k])) < 1e-10:
193 feats[k] = [ float(round(kv*1e6)/1e6) for kv in feats[k]]
194 return feats
195
196 def add_missing_levels2(ff):
197
198 if sum( [f.count(".") for f in ff] ) < 1: return ff
199
200 dn = {}
201
202 added = True
203 while added:
204 added = False
205 for f in ff:
206 lev = f.count(".")
207 if lev == 0: continue
208 if lev not in dn: dn[lev] = [f]
209 else: dn[lev].append(f)
210 for fn in sorted(dn,reverse=True):
211 for f in dn[fn]:
212 fc = ".".join(f.split('.')[:-1])
213 if fc not in ff:
214 ab_all = [ff[fg] for fg in ff if (fg.count(".") == 0 and fg == fc) or (fg.count(".") > 0 and fc == ".".join(fg.split('.')[:-1]))]
215 ab =[]
216 for l in [f for f in zip(*ab_all)]:
217 ab.append(sum([float(ll) for ll in l]))
218 ff[fc] = ab
219 added = True
220 if added:
221 break
222
223 return ff
224
225
226 def add_missing_levels(ff):
227 if sum( [f.count(".") for f in ff] ) < 1: return ff
228
229 clades2leaves = {}
230 for f in ff:
231 fs = f.split(".")
232 if len(fs) < 2:
233 continue
234 for l in range(len(fs)):
235 n = ".".join( fs[:l] )
236 if n in clades2leaves:
237 clades2leaves[n].append( f )
238 else:
239 clades2leaves[n] = [f]
240 for k,v in clades2leaves.items():
241 if k and k not in ff:
242 ff[k] = [sum(a) for a in zip(*[[float(fn) for fn in ff[vv]] for vv in v])]
243 return ff
244
245
246
247 def modify_feature_names(fn):
248 ret = fn
249
250 for v in [' ',r'\$',r'\@',r'#',r'%',r'\^',r'\&',r'\*',r'\"',r'\'']:
251 ret = [re.sub(v,"",f) for f in ret]
252 for v in ["/",r'\(',r'\)',r'-',r'\+',r'=',r'{',r'}',r'\[',r'\]',
253 r',',r'\.',r';',r':',r'\?',r'\<',r'\>',r'\.',r'\,']:
254 ret = [re.sub(v,"_",f) for f in ret]
255 for v in ["\|"]:
256 ret = [re.sub(v,".",f) for f in ret]
257
258 ret2 = []
259 for r in ret:
260 if r[0] in ['0','1','2','3','4','5','6','7','8','9']:
261 ret2.append("f_"+r)
262 else: ret2.append(r)
263
264 return ret2
265
266
267 def rename_same_subcl(cl,subcl):
268 toc = []
269 for sc in set(subcl):
270 if len(set([cl[i] for i in range(len(subcl)) if sc == subcl[i]])) > 1:
271 toc.append(sc)
272 new_subcl = []
273 for i,sc in enumerate(subcl):
274 if sc in toc: new_subcl.append(cl[i]+"_"+sc)
275 else: new_subcl.append(sc)
276 return new_subcl
277
278
279 #*************************************************************************************
280 #* Modifications by George Weingart, Jan 15, 2014 *
281 #* If the input file is biom: *
282 #* a. Load an AbundanceTable (Using breadcrumbs) *
283 #* b. Create a sequential file from the AbundanceTable (de-facto - pcl) *
284 #* c. Use that file as input to the rest of the program *
285 #* d. Calculate the c,s,and u parameters, either from the values the User entered *
286 #* from the meta data values in the biom file or set up defaults *
287 #* <<<------------- I M P O R T A N T N O T E ------------------->> *
288 #* breadcrumbs src directory must be included in the PYTHONPATH *
289 #* <<<------------- I M P O R T A N T N O T E ------------------->> *
290 #*************************************************************************************
291 def biom_processing(inp_file):
292 CommonArea = dict() #* Set up a dictionary to return
293 CommonArea['abndData'] = AbundanceTable.funcMakeFromFile(inp_file, #* Create AbundanceTable from input biom file
294 cDelimiter = None,
295 sMetadataID = None,
296 sLastMetadataRow = None,
297 sLastMetadata = None,
298 strFormat = None)
299
300 #****************************************************************
301 #* Building the data element here *
302 #****************************************************************
303 ResolvedData = list() #This is the Resolved data that will be returned
304 IDMetadataName = CommonArea['abndData'].funcGetIDMetadataName() #* ID Metadataname
305 IDMetadata = [CommonArea['abndData'].funcGetIDMetadataName()] #* The first Row
306 for IDMetadataEntry in CommonArea['abndData'].funcGetMetadataCopy()[IDMetadataName]: #* Loop on all the metadata values
307 IDMetadata.append(IDMetadataEntry)
308 ResolvedData.append(IDMetadata) #Add the IDMetadata with all its values to the resolved area
309 for key, value in CommonArea['abndData'].funcGetMetadataCopy().iteritems():
310 if key != IDMetadataName:
311 MetadataEntry = list() #* Set it up
312 MetadataEntry.append(key) #* And post it to the area
313 for x in value:
314 MetadataEntry.append(x) #* Append the metadata value name
315 ResolvedData.append(MetadataEntry)
316 for AbundanceDataEntry in CommonArea['abndData'].funcGetAbundanceCopy(): #* The Abundance Data
317 lstAbundanceDataEntry = list(AbundanceDataEntry) #Convert tuple to list
318 ResolvedData.append(lstAbundanceDataEntry) #Append the list to the metadata list
319 CommonArea['ReturnedData'] = ResolvedData #Post the results
320 return CommonArea
321
322
323 #*******************************************************************************
324 #* Check the params and override in the case of biom *
325 #*******************************************************************************
326 def check_params_for_biom_case(params, CommonArea):
327 CommonArea['MetadataNames'] = list() #Metadata names
328 params['original_class'] = params['class'] #Save the original class
329 params['original_subclass'] = params['subclass'] #Save the original subclass
330 params['original_subject'] = params['subject'] #Save the original subclass
331
332
333 TotalMetadataEntriesAndIDInBiomFile = len(CommonArea['abndData'].funcGetMetadataCopy()) # The number of metadata entries
334 for i in range(0,TotalMetadataEntriesAndIDInBiomFile): #* Populate the meta data names table
335 CommonArea['MetadataNames'].append(CommonArea['ReturnedData'][i][0]) #Add the metadata name
336
337
338 #****************************************************
339 #* Setting the params here *
340 #****************************************************
341
342 if TotalMetadataEntriesAndIDInBiomFile > 0: #If there is at least one entry - has to be the subject
343 params['subject'] = 1
344 if TotalMetadataEntriesAndIDInBiomFile == 2: #If there are 2 - The first is the subject and the second has to be the metadata, and that is the class
345 params['class'] = 2
346 if TotalMetadataEntriesAndIDInBiomFile == 3: #If there are 3: Set up default that the second entry is the class and the third is the subclass
347 params['class'] = 2
348 params['subclass'] = 3
349 FlagError = False #Set up error flag
350
351 if not params['biom_class'] is None and not params['biom_subclass'] is None: #Check if the User passed a valid class and subclass
352 if params['biom_class'] in CommonArea['MetadataNames']:
353 params['class'] = CommonArea['MetadataNames'].index(params['biom_class']) +1 #* Set up the index for that metadata
354 else:
355 FlagError = True
356 if params['biom_subclass'] in CommonArea['MetadataNames']:
357 params['subclass'] = CommonArea['MetadataNames'].index(params['biom_subclass']) +1 #* Set up the index for that metadata
358 else:
359 FlagError = True
360 if FlagError == True: #* If the User passed an invalid class
361 print "**Invalid biom class or subclass passed - Using defaults: First metadata=class, Second Metadata=subclass\n"
362 params['class'] = 2
363 params['subclass'] = 3
364 return params
365
366
367
368 if __name__ == '__main__':
369 CommonArea = dict() #Build a Common Area to pass variables in the biom case
370 params = read_params(sys.argv)
371
372 #*************************************************************
373 #* Conditionally import breadcrumbs if file is a biom file *
374 #* If it is and no breadcrumbs found - abnormally exit *
375 #*************************************************************
376 if params['input_file'].endswith('.biom'):
377 try:
378 from lefsebiom.ConstantsBreadCrumbs import *
379 from lefsebiom.AbundanceTable import *
380 except ImportError:
381 sys.stderr.write("************************************************************************************************************ \n")
382 sys.stderr.write("* Error: Breadcrumbs libraries not detected - required to process biom files - run abnormally terminated * \n")
383 sys.stderr.write("************************************************************************************************************ \n")
384 exit(1)
385
386
387 if type(params['subclass']) is int and int(params['subclass']) < 1:
388 params['subclass'] = None
389 if type(params['subject']) is int and int(params['subject']) < 1:
390 params['subject'] = None
391
392
393 CommonArea = read_input_file(sys.argv[1], CommonArea) #Pass The CommonArea to the Read
394 data = CommonArea['ReturnedData'] #Select the data
395
396 if sys.argv[1].endswith('biom'): #* Check if biom:
397 params = check_params_for_biom_case(params, CommonArea) #Check the params for the biom case
398
399 if params['feats_dir'] == "c":
400 data = transpose(data)
401
402 ncl = 1
403 if not params['subclass'] is None: ncl += 1
404 if not params['subject'] is None: ncl += 1
405
406 first_line = zip(*data)[0]
407
408 first_line = modify_feature_names(list(first_line))
409
410 data = zip( first_line,
411 *sort_by_cl(zip(*data)[1:],
412 ncl,
413 params['class']-1,
414 params['subclass']-1 if not params['subclass'] is None else None,
415 params['subject']-1 if not params['subject'] is None else None))
416 # data.insert(0,first_line)
417 # data = remove_missing(data,params['missing_p'])
418 cls = {}
419
420 cls_i = [('class',params['class']-1)]
421 if params['subclass'] > 0: cls_i.append(('subclass',params['subclass']-1))
422 if params['subject'] > 0: cls_i.append(('subject',params['subject']-1))
423 cls_i.sort(lambda x, y: -cmp(x[1],y[1]))
424 for v in cls_i: cls[v[0]] = data.pop(v[1])[1:]
425 if not params['subclass'] > 0: cls['subclass'] = [str(cl)+"_subcl" for cl in cls['class']]
426
427 cls['subclass'] = rename_same_subcl(cls['class'],cls['subclass'])
428 # if 'subclass' in cls.keys(): cls = group_small_subclasses(cls,params['subcl_min_card'])
429 class_sl,subclass_sl,class_hierarchy = get_class_slices(zip(*cls.values()))
430
431 feats = dict([(d[0],d[1:]) for d in data])
432
433 feats = add_missing_levels(feats)
434
435 feats = numerical_values(feats,params['norm_v'])
436 out = {}
437 out['feats'] = feats
438 out['norm'] = params['norm_v']
439 out['cls'] = cls
440 out['class_sl'] = class_sl
441 out['subclass_sl'] = subclass_sl
442 out['class_hierarchy'] = class_hierarchy
443
444 if params['output_table']:
445 with open( params['output_table'], "w") as outf:
446 if 'class' in cls: outf.write( "\t".join(list(["class"])+list(cls['class'])) + "\n" )
447 if 'subclass' in cls: outf.write( "\t".join(list(["subclass"])+list(cls['subclass'])) + "\n" )
448 if 'subject' in cls: outf.write( "\t".join(list(["subject"])+list(cls['subject'])) + "\n" )
449 for k,v in out['feats'].items(): outf.write( "\t".join([k]+[str(vv) for vv in v]) + "\n" )
450
451 with open(params['output_file'], 'wb') as back_file:
452 pickle.dump(out,back_file)
453