comparison ete_lineage_generator.py @ 2:03c10736e497 draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
author earlhaminst
date Tue, 07 Nov 2017 11:45:13 -0500
parents
children 87b6de3ef63e
comparison
equal deleted inserted replaced
1:a4ba317fc713 2:03c10736e497
1 import optparse
2 import sys
3
4 from ete3 import NCBITaxa
5
6 # - compared to gi2taxonomy the root is excluded, since
7 # the value is always "root", i.e. useless information
8 # - additional levels that appear in the ncbi taxdb have
9 # been added
10 # (order from https://en.wikipedia.org/wiki/Taxonomic_rank#All_ranks)
11 # TODO the full list of ranks could be derived from the input DB
12 LONG_RANKS = [u"superkingdom", u"kingdom", u"subkingdom",
13 u"superphylum", u"phylum", u"subphylum",
14 u"superclass", u"class", u"subclass", "infraclass",
15 u"cohort",
16 u"superorder", u"order", u"suborder", u"infraorder", u"parvorder",
17 u"superfamily", u"family", u"subfamily",
18 u"tribe", u"subtribe",
19 u"genus", u"subgenus",
20 u"species group", u"species subgroup", u"species", u"subspecies",
21 u"varietas", "forma"]
22
23 SHORT_RANKS = [u"kingdom",
24 u"phylum",
25 u"class",
26 u"order",
27 u"family",
28 u"genus",
29 u"species"]
30
31
32 def process_taxid(ncbi, taxid, ranks, RANK_IDX, lower=False):
33 """
34 process one taxid:
35 - get lineage (as list of taxids, ranks, and names)
36 - reverse the lineage if lower ranks are to be used for filling
37 - fill the ranks with the data from the lineage
38 ncbi: ete NCBITaxa object
39 taxid: a taxid (int)
40 ranks: list of ranks (should be initialized with "NA" x number of levels of interest)
41 RANK_IDX: mapping from rank names to indices (distance to root/leaf?)
42 lower: use lower taxa for filling "NA"s
43 """
44 lineage = ncbi.get_lineage(taxid)
45 lineage_ranks = ncbi.get_rank(lineage)
46 lineage_names = ncbi.get_taxid_translator(lineage, try_synonyms=True)
47 if lower:
48 lineage.reverse()
49 for l in lineage:
50 if not lineage_ranks[l] in RANK_IDX:
51 continue
52 if ranks[RANK_IDX[lineage_ranks[l]]] != "NA":
53 continue
54 ranks[RANK_IDX[lineage_ranks[l]]] = lineage_names[l]
55
56
57 # get command line options
58 parser = optparse.OptionParser()
59 parser.add_option('-s', '--species', dest="input_species_filename",
60 help='Species/taxid list in text format one species in each line')
61 parser.add_option('-d', '--database', dest="database", default=None,
62 help='ETE sqlite data base to use (default: ~/.etetoolkit/taxa.sqlite)')
63 parser.add_option('-o', '--output', dest="output", help='output file name (default: stdout)')
64 parser.add_option('-f', dest="full", action="store_true", default=False,
65 help='Show all available (named) taxonomic ranks (default: only primary levels)')
66 parser.add_option('-c', dest="compress", action="store_true", default=False,
67 help='Fill unnamed ranks with super/sub ranks (see -l)')
68 parser.add_option('-l', dest="lower", action="store_true", default=False,
69 help='Prefer lower levels when compressed')
70 parser.add_option('-r', '--rank', dest='ranks', action="append",
71 help='include rank - multiple ones can be specified')
72
73 options, args = parser.parse_args()
74 # check command line options
75 if options.input_species_filename is None:
76 parser.error("-s option must be specified, Species list in text format one species in each line")
77 if options.full and options.ranks:
78 parser.error("-f and -r can not be used at the same time")
79 if options.ranks:
80 for r in options.ranks:
81 if r not in LONG_RANKS:
82 parser.error("unknown rank %s" % r)
83 # setup output
84 if not options.output: # if filename is not given
85 of = sys.stdout
86 else:
87 of = open(options.output, "w")
88 # load NCBI taxonomy DB
89 ncbi = NCBITaxa(dbfile=options.database)
90 # get list of ranks that are of interest
91 if options.ranks:
92 RANKS = []
93 for r in LONG_RANKS:
94 if r in options.ranks:
95 RANKS.append(r)
96 else:
97 if options.full:
98 RANKS = LONG_RANKS
99 else:
100 RANKS = SHORT_RANKS
101 RANK_IDX = {item: index for index, item in enumerate(RANKS)}
102 COMP_RANK_IDX = RANK_IDX
103 if options.compress:
104 for ir in range(len(RANKS)):
105 for ilr in range(len(LONG_RANKS)):
106 if RANKS[ir] in LONG_RANKS[ilr]:
107 COMP_RANK_IDX[LONG_RANKS[ilr]] = ir
108 with open(options.input_species_filename) as f:
109 for line in f.readlines():
110 line = line.strip().replace('_', ' ')
111 try:
112 taxid = int(line)
113 except ValueError:
114 # TODO: one could use fuzzy name lookup (i.e. accept typos in the species names),
115 # but then a pysqlite version that supports this is needed (needs to be enabled
116 # during compilation)
117 name2tax = ncbi.get_name_translator([line])
118 if line in name2tax:
119 taxid = name2tax[line][0]
120 else:
121 sys.stderr.write("[%s] could not be translated into a taxid!\n" % line)
122 continue
123 ranks = ["NA"] * len(RANKS)
124 process_taxid(ncbi, taxid, ranks, RANK_IDX)
125 if options.compress:
126 process_taxid(ncbi, taxid, ranks, COMP_RANK_IDX, options.lower)
127 of.write("%s\t%s\n" % (line, "\t".join(ranks)))
128 of.close()