Mercurial > repos > earlhaminst > ete
diff ete_lineage_generator.py @ 2:03c10736e497 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
author | earlhaminst |
---|---|
date | Tue, 07 Nov 2017 11:45:13 -0500 |
parents | |
children | 87b6de3ef63e |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ete_lineage_generator.py Tue Nov 07 11:45:13 2017 -0500 @@ -0,0 +1,128 @@ +import optparse +import sys + +from ete3 import NCBITaxa + +# - compared to gi2taxonomy the root is excluded, since +# the value is always "root", i.e. useless information +# - additional levels that appear in the ncbi taxdb have +# been added +# (order from https://en.wikipedia.org/wiki/Taxonomic_rank#All_ranks) +# TODO the full list of ranks could be derived from the input DB +LONG_RANKS = [u"superkingdom", u"kingdom", u"subkingdom", + u"superphylum", u"phylum", u"subphylum", + u"superclass", u"class", u"subclass", "infraclass", + u"cohort", + u"superorder", u"order", u"suborder", u"infraorder", u"parvorder", + u"superfamily", u"family", u"subfamily", + u"tribe", u"subtribe", + u"genus", u"subgenus", + u"species group", u"species subgroup", u"species", u"subspecies", + u"varietas", "forma"] + +SHORT_RANKS = [u"kingdom", + u"phylum", + u"class", + u"order", + u"family", + u"genus", + u"species"] + + +def process_taxid(ncbi, taxid, ranks, RANK_IDX, lower=False): + """ + process one taxid: + - get lineage (as list of taxids, ranks, and names) + - reverse the lineage if lower ranks are to be used for filling + - fill the ranks with the data from the lineage + ncbi: ete NCBITaxa object + taxid: a taxid (int) + ranks: list of ranks (should be initialized with "NA" x number of levels of interest) + RANK_IDX: mapping from rank names to indices (distance to root/leaf?) + lower: use lower taxa for filling "NA"s + """ + lineage = ncbi.get_lineage(taxid) + lineage_ranks = ncbi.get_rank(lineage) + lineage_names = ncbi.get_taxid_translator(lineage, try_synonyms=True) + if lower: + lineage.reverse() + for l in lineage: + if not lineage_ranks[l] in RANK_IDX: + continue + if ranks[RANK_IDX[lineage_ranks[l]]] != "NA": + continue + ranks[RANK_IDX[lineage_ranks[l]]] = lineage_names[l] + + +# get command line options +parser = optparse.OptionParser() +parser.add_option('-s', '--species', dest="input_species_filename", + help='Species/taxid list in text format one species in each line') +parser.add_option('-d', '--database', dest="database", default=None, + help='ETE sqlite data base to use (default: ~/.etetoolkit/taxa.sqlite)') +parser.add_option('-o', '--output', dest="output", help='output file name (default: stdout)') +parser.add_option('-f', dest="full", action="store_true", default=False, + help='Show all available (named) taxonomic ranks (default: only primary levels)') +parser.add_option('-c', dest="compress", action="store_true", default=False, + help='Fill unnamed ranks with super/sub ranks (see -l)') +parser.add_option('-l', dest="lower", action="store_true", default=False, + help='Prefer lower levels when compressed') +parser.add_option('-r', '--rank', dest='ranks', action="append", + help='include rank - multiple ones can be specified') + +options, args = parser.parse_args() +# check command line options +if options.input_species_filename is None: + parser.error("-s option must be specified, Species list in text format one species in each line") +if options.full and options.ranks: + parser.error("-f and -r can not be used at the same time") +if options.ranks: + for r in options.ranks: + if r not in LONG_RANKS: + parser.error("unknown rank %s" % r) +# setup output +if not options.output: # if filename is not given + of = sys.stdout +else: + of = open(options.output, "w") +# load NCBI taxonomy DB +ncbi = NCBITaxa(dbfile=options.database) +# get list of ranks that are of interest +if options.ranks: + RANKS = [] + for r in LONG_RANKS: + if r in options.ranks: + RANKS.append(r) +else: + if options.full: + RANKS = LONG_RANKS + else: + RANKS = SHORT_RANKS +RANK_IDX = {item: index for index, item in enumerate(RANKS)} +COMP_RANK_IDX = RANK_IDX +if options.compress: + for ir in range(len(RANKS)): + for ilr in range(len(LONG_RANKS)): + if RANKS[ir] in LONG_RANKS[ilr]: + COMP_RANK_IDX[LONG_RANKS[ilr]] = ir +with open(options.input_species_filename) as f: + for line in f.readlines(): + line = line.strip().replace('_', ' ') + try: + taxid = int(line) + except ValueError: + # TODO: one could use fuzzy name lookup (i.e. accept typos in the species names), + # but then a pysqlite version that supports this is needed (needs to be enabled + # during compilation) + name2tax = ncbi.get_name_translator([line]) + if line in name2tax: + taxid = name2tax[line][0] + else: + sys.stderr.write("[%s] could not be translated into a taxid!\n" % line) + continue + ranks = ["NA"] * len(RANKS) + process_taxid(ncbi, taxid, ranks, RANK_IDX) + if options.compress: + process_taxid(ncbi, taxid, ranks, COMP_RANK_IDX, options.lower) + of.write("%s\t%s\n" % (line, "\t".join(ranks))) +of.close()