annotate home/ubuntu/lefse_to_export/qiime2lefse.py @ 2:a31c10fe09c8 draft default tip

Fixed bug due to numerical approximation after normalization affecting root-level clades (e.g. "Bacteria" or "Archaea")
author george-weingart
date Tue, 07 Jul 2015 13:52:29 -0400
parents db64b6287cd6
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
1 #!/usr/bin/env python
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
2
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
3 import sys
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
4
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
def read_params(args):
    """Parse the command-line options of the QIIME-to-LEfSe converter.

    ``args`` is accepted for backward compatibility but is not used:
    the options are always read from ``sys.argv`` by argparse.

    Returns:
        dict: the parsed options, keyed by 'in', 'md', 'out', 'c', 's'
        and 'u'. 'in' defaults to ``sys.stdin``, 'out' to ``sys.stdout``
        and every other option to ``None``.
    """
    import argparse as ap

    p = ap.ArgumentParser( description= "TBA" )

    # Input / output locations.
    p.add_argument( '--in', metavar='INPUT_FILE', type=str,
        nargs='?', default=sys.stdin,
        help= "the Qiime OTU table file "
              "[ stdin if not present ]" )
    # The help text used to repeat the '--in' description by mistake.
    p.add_argument( '--md', metavar='METADATA_FILE', type=str,
        nargs='?', default=None,
        help= "the Qiime metadata file "
              "[ only OTU table without metadata if not present ]" )
    p.add_argument( '--out', metavar='OUTPUT_FILE', type=str,
        nargs = '?', default=sys.stdout,
        help= "the output file "
              "[stdout if not present]")

    # Metadata attributes to export as class / subclass / subject.
    p.add_argument( '-c', metavar="class attribute",
        type=str,
        help = "the attribute to use as class" )
    p.add_argument( '-s', metavar="subclass attribute",
        type=str,
        help = "the attribute to use as subclass" )
    p.add_argument( '-u', metavar="subject attribute",
        type=str,
        help = "the attribute to use as subject" )

    return vars(p.parse_args())
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
37
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
38
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
39
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
def qiime2lefse( fin, fmd, fout, all_md, sel_md ):
    """Convert a QIIME OTU table, plus optional metadata, to a LEfSe table.

    The OTU table is read as tab-separated text; its first line is
    skipped and the next line is taken as the header. The last column
    must be headed 'Consensus Lineage'. Every lineage gets the value of
    the first column of its row appended after a '|', ';' is turned
    into '|' and double quotes are dropped.

    The output is tab-separated with one line per selected metadata
    field (field name, then one value per sample) followed by one line
    per table row (lineage, then one value per sample).

    Args:
        fin: path of the OTU table, or ``sys.stdin``.
        fmd: path of the tab-separated metadata file, or a false value
            for no metadata. Its first row holds the field names and
            its first column the sample names.
        fout: path of the output file, or ``sys.stdout``.
        all_md: if true, export every metadata field of ``fmd``.
        sel_md: metadata field names to export when ``all_md`` is
            false; false entries (e.g. ``None``) are ignored.

    Raises:
        KeyError: if the table has no 'Consensus Lineage' column, or a
            sample or a selected field is missing from the metadata.
        IndexError: if the table has no rows after its first line, or
            the metadata file is empty.
    """
    # Read the table without its first line. Blank lines are dropped
    # because a one-cell row would truncate the transposition below.
    inpf = fin if fin is sys.stdin else open(fin)
    try:
        rows = [l.strip().split('\t')
                    for l in inpf.readlines()[1:] if l.strip()]
    finally:
        # Only close what was opened here, never the standard stream.
        if inpf is not sys.stdin:
            inpf.close()

    # Transpose: one list per column, each starting with its header cell.
    lines = [list(col) for col in zip(*rows)]

    # Make the lineages unique by appending the first-column value.
    for i,(l1,l2) in enumerate(zip( lines[0], lines[-1] )):
        if not l2 == 'Consensus Lineage':
            lines[-1][i] = l2+"|"+l1

    # Column header -> column values; the first column is left out.
    data = dict([(l[0],l[1:]) for l in lines[1:]])

    md = {}
    mdf = []
    if fmd:
        with open(fmd) as inpf:
            mdlines = [l.strip().split('\t')
                           for l in inpf.readlines() if l.strip()]

        mdf = mdlines[0][1:]

        for l in mdlines:
            md[l[0]] = dict(zip(mdf,l[1:]))

    # Default to every metadata field, in header order and without
    # repeats. (Indexing md.values() is not possible on Python 3.)
    selected_md = []
    for field in mdf:
        if field not in selected_md:
            selected_md.append(field)

    if not all_md:
        selected_md = [s for s in sel_md if s]

    # First entry: metadata field names followed by the cleaned lineages.
    out_m = [ selected_md +
              [d.replace(";","|").replace("\"","")
                   for d in data[ 'Consensus Lineage' ]] ]
    # One entry per sample: its metadata values followed by its counts.
    for k,v in data.items():
        if k == 'Consensus Lineage':
            continue
        out_m.append( [md[k][kmd] for kmd in selected_md] + list(v) )

    # Transpose back so that every output line describes one feature.
    outf = fout if fout is sys.stdout else open( fout, "w" )
    try:
        for l in zip(*out_m):
            outf.write( "\t".join(l) + "\n" )
    finally:
        if outf is not sys.stdout:
            outf.close()
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
77
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
if __name__ == '__main__':
    # Script entry point: parse the command line and run the conversion.
    pars = read_params( sys.argv )

    # The class, subclass and subject attributes, in that order.
    chosen = [pars[opt] for opt in ('c', 's', 'u')]

    # When none of the three is given, every metadata field is exported.
    qiime2lefse( fin = pars['in'],
                 fmd = pars['md'],
                 fout = pars['out'],
                 all_md = not any(chosen),
                 sel_md = chosen )