#!/usr/bin/env python

import sys
import csv
import argparse as ap
from ete3 import NCBITaxa

# Name of the file (created in the current working directory) that receives
# one outlier sample accession per line.
OUTLIER_LIST_PATH = "outlier_list.txt"

# A row whose 'distance' value exceeds this threshold is treated as an outlier.
MAX_DISTANCE = 0.05


def build_parser():
    """Return the command-line argument parser for outlier-parser.

    Positional arguments: the ETE3 sqlite taxonomy database, the target
    taxonomic ID (parsed as an integer) and one or more tab-delimited report
    files.
    """
    parser = ap.ArgumentParser(prog='outlier-parser', conflict_handler='resolve',
                               description="Parses output file to eliminate outliers")

    # Named input_group (not "input") so the builtin is not shadowed.
    input_group = parser.add_argument_group('Input', '')
    input_group.add_argument('db', help="sqlite formatted ETE3 taxa database")
    input_group.add_argument('taxid', metavar="INT", type=int,
                             help='Target taxonomic ID')
    input_group.add_argument('input', nargs='+',
                             help="Tab-delimited RefSeq Masher reports")
    return parser


def is_outlier(row, accepted_taxids, max_distance=MAX_DISTANCE):
    """Return True when a report row falls outside the accepted taxa or distance.

    row             -- mapping with at least the 'taxid' and 'distance' keys
                       (string values, as produced by csv.DictReader).
    accepted_taxids -- container of integer taxonomic IDs considered in-group.
    max_distance    -- largest 'distance' value still considered acceptable.

    Raises ValueError if 'taxid' or 'distance' cannot be parsed as a number.
    """
    return (int(row['taxid']) not in accepted_taxids
            or float(row['distance']) > max_distance)


def find_outliers(rows, accepted_taxids, flagged=None, max_distance=MAX_DISTANCE):
    """Yield each sample accession the first time one of its rows is an outlier.

    rows            -- iterable of mappings with 'sample', 'taxid' and
                       'distance' keys.
    accepted_taxids -- container of integer taxonomic IDs considered in-group.
    flagged         -- optional set of accessions already reported; it is
                       updated in place so callers can share it across several
                       reports and avoid duplicate output lines.
    max_distance    -- largest 'distance' value still considered acceptable.
    """
    if flagged is None:
        flagged = set()

    for row in rows:
        sample = row['sample']
        # Tracking flagged samples individually (rather than with one global
        # boolean) guarantees that every row of a not-yet-flagged sample is
        # still evaluated after some other sample has been flagged.
        if sample in flagged:
            continue
        if is_outlier(row, accepted_taxids, max_distance):
            flagged.add(sample)
            yield sample


def main(argv=None):
    """Parse the command line and write outlier accessions to OUTLIER_LIST_PATH.

    argv -- argument list without the program name; defaults to sys.argv[1:].

    Prints the usage string and exits with status 1 when no arguments are given.
    """
    if argv is None:
        argv = sys.argv[1:]

    parser = build_parser()
    if not argv:
        parser.print_usage()
        sys.exit(1)
    args = parser.parse_args(argv)

    ncbi = NCBITaxa(args.db)
    # Build the accepted set once: the target taxon itself plus everything
    # get_descendant_taxa returns for it. A set gives constant-time membership
    # tests inside the per-row loop.
    accepted_taxids = set(ncbi.get_descendant_taxa(args.taxid, intermediate_nodes=True))
    accepted_taxids.add(args.taxid)

    flagged = set()
    # The with-block guarantees the output file is flushed and closed, even if
    # a report turns out to be unreadable or malformed.
    with open(OUTLIER_LIST_PATH, "w") as output:
        for report in args.input:
            with open(report) as csvfile:
                # DictReader consumes the header line itself to obtain the
                # field names, so no row must be skipped manually here.
                reader = csv.DictReader(csvfile, delimiter='\t')
                for accession in find_outliers(reader, accepted_taxids, flagged):
                    output.write("%s\n" % accession)


if __name__ == "__main__":
    main()
