view home/ubuntu/lefse_to_export/qiime2lefse.py @ 1:db64b6287cd6 draft

Modified datatypes
author george-weingart
date Wed, 20 Aug 2014 16:56:51 -0400
parents
children
line wrap: on
line source

#!/usr/bin/env python

import sys

def read_params(args):
    """Parse the command line of the Qiime-to-LEfSe converter.

    Parameters:
        args: the full argument vector, program name included in position 0
            (normally ``sys.argv``).  ``None`` lets argparse fall back to
            ``sys.argv`` on its own.

    Returns:
        A dict with the keys ``'in'``, ``'md'``, ``'out'``, ``'c'``, ``'s'``
        and ``'u'``.  ``'in'`` and ``'out'`` are path strings, or the
        ``sys.stdin`` / ``sys.stdout`` stream objects when no path was
        supplied; the remaining entries are strings or ``None``.
    """
    import argparse as ap

    p = ap.ArgumentParser(
        description="Convert a Qiime OTU table, optionally joined with "
                    "sample metadata, into a tab-separated table for LEfSe")

    # nargs='?' stores `const` when the flag is given without a value; point
    # it at the same stream as the default so a bare `--in` / `--out` does
    # not turn into None further down the line.
    p.add_argument('--in', metavar='INPUT_FILE', type=str,
                   nargs='?', default=sys.stdin, const=sys.stdin,
                   help="the Qiime OTU table file "
                        "[ stdin if not present ]")
    p.add_argument('--md', metavar='METADATA_FILE', type=str,
                   nargs='?', default=None,
                   help="the metadata file "
                        "[ only OTU table without metadata if not present ]")
    p.add_argument('--out', metavar='OUTPUT_FILE', type=str,
                   nargs='?', default=sys.stdout, const=sys.stdout,
                   help="the output file "
                        "[stdout if not present]")

    p.add_argument('-c', metavar="class attribute",
                   type=str,
                   help="the attribute to use as class")
    p.add_argument('-s', metavar="subclass attribute",
                   type=str,
                   help="the attribute to use as subclass")
    p.add_argument('-u', metavar="subject attribute",
                   type=str,
                   help="the attribute to use as subject")

    # Honour the vector handed in by the caller instead of silently reading
    # the global sys.argv; element 0 is the program name and is dropped.
    argv = None if args is None else list(args)[1:]
    return vars(p.parse_args(argv))



def _read_lines(source):
    """Return every line of *source*, which is a path or a readable stream."""
    if hasattr(source, "readlines"):
        # Streams such as sys.stdin belong to the caller: read, never close.
        return source.readlines()
    with open(source) as handle:
        return handle.readlines()


def _split_rows(raw_lines):
    """Split tab-separated lines into lists of cells, skipping blank lines."""
    return [ln.strip().split('\t') for ln in raw_lines if ln.strip()]


def qiime2lefse(fin, fmd, fout, all_md, sel_md):
    """Convert a Qiime OTU table (plus optional metadata) to a LEfSe table.

    The first line of the OTU table is discarded; the next line is the
    header.  The first column holds the OTU ids, the last column must be
    headed ``Consensus Lineage`` and every column in between is one sample.
    The output has one row per selected metadata attribute followed by one
    row per OTU; each OTU row is labelled ``<lineage>|<otu id>`` with ``;``
    turned into ``|`` and double quotes removed.

    Parameters:
        fin: path of the OTU table, or an open readable stream
            (e.g. ``sys.stdin``).
        fmd: path (or readable stream) of the tab-separated metadata table
            whose first row names the attributes and whose first column
            holds the sample ids; a false value means "no metadata".
        fout: path of the output file, or an open writable stream
            (e.g. ``sys.stdout``).  Streams are written to but not closed.
        all_md: when true, emit every metadata attribute in file order.
        sel_md: attribute names to emit when ``all_md`` is false; false
            entries (``None``, ``""``) are ignored.

    Raises:
        IndexError: if the OTU table has no rows after its first line.
        KeyError: if no column is headed ``Consensus Lineage``, or if a
            sample or selected attribute is missing from the metadata.
    """
    # Transpose the table so each entry is one column (header cell first).
    columns = [list(col) for col in zip(*_split_rows(_read_lines(fin)[1:]))]

    # Append the OTU id to every lineage so that feature names stay unique;
    # the header cell itself is left alone because it is used as a key below.
    columns[-1] = [
        name if name == 'Consensus Lineage' else name + "|" + otu
        for otu, name in zip(columns[0], columns[-1])
    ]

    data = dict((col[0], col[1:]) for col in columns[1:])

    md = {}
    md_fields = []
    if fmd:
        md_rows = _split_rows(_read_lines(fmd))
        if md_rows:
            md_fields = md_rows[0][1:]
            for row in md_rows[1:]:
                md[row[0]] = dict(zip(md_fields, row[1:]))

    if all_md:
        # Take the attribute names straight from the header: this keeps the
        # file order and avoids indexing a dict view (a TypeError on Python 3).
        selected_md = list(md_fields)
    else:
        selected_md = [name for name in sel_md if name]

    features = [name.replace(";", "|").replace("\"", "")
                for name in data['Consensus Lineage']]
    out_m = [selected_md + features]
    for sample, values in data.items():
        if sample == 'Consensus Lineage':
            continue
        out_m.append([md[sample][name] for name in selected_md] + list(values))

    # Transpose back: one output row per attribute / feature.
    text = "".join("\t".join(row) + "\n" for row in zip(*out_m))

    if hasattr(fout, "write"):
        # Do not close a caller-owned stream such as sys.stdout.
        fout.write(text)
    else:
        with open(fout, "w") as outf:
            outf.write(text)

if __name__ == '__main__':
    # Script entry point: parse the command line, then run the conversion.
    options = read_params(sys.argv)

    # Class, subclass and subject attribute names, in that order.
    chosen = [options[flag] for flag in ('c', 's', 'u')]

    # With no attribute requested at all, every metadata field is exported.
    qiime2lefse(fin=options['in'],
                fmd=options['md'],
                fout=options['out'],
                all_md=not any(chosen),
                sel_md=chosen)