annotate ete_genetree_splitter.py @ 9:b29ee6a16524 draft

"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 17c65045b726d0695814bfe761e534f6521786f1"
author earlhaminst
date Tue, 20 Oct 2020 15:10:40 +0000
parents 6a5282f71f82
children dc32007a6b36
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
077021c45b96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit a22e605b871c2185e98d89598aebb2fa3a82bb8f
earlhaminst
parents:
diff changeset
1 from __future__ import print_function
077021c45b96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit a22e605b871c2185e98d89598aebb2fa3a82bb8f
earlhaminst
parents:
diff changeset
2
077021c45b96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit a22e605b871c2185e98d89598aebb2fa3a82bb8f
earlhaminst
parents:
diff changeset
3 import optparse
077021c45b96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit a22e605b871c2185e98d89598aebb2fa3a82bb8f
earlhaminst
parents:
diff changeset
4
077021c45b96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit a22e605b871c2185e98d89598aebb2fa3a82bb8f
earlhaminst
parents:
diff changeset
5 from ete3 import PhyloTree
077021c45b96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit a22e605b871c2185e98d89598aebb2fa3a82bb8f
earlhaminst
parents:
diff changeset
6
077021c45b96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit a22e605b871c2185e98d89598aebb2fa3a82bb8f
earlhaminst
parents:
diff changeset
7
077021c45b96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit a22e605b871c2185e98d89598aebb2fa3a82bb8f
earlhaminst
parents:
diff changeset
def _write_subtrees(subtrees, output_format):
    """Write each subtree to its own ``<n>_genetree.nhx`` file.

    Files are created relative to the current working directory and are
    numbered from 1 in iteration order.

    :param subtrees: iterable of tree nodes exposing ``write(format=...)``
    :param output_format: integer format code passed through to ``write``
    """
    for cluster_id, subtree in enumerate(subtrees, 1):
        outfile = str(cluster_id) + '_genetree.nhx'
        with open(outfile, 'w') as f:
            f.write(subtree.write(format=output_format))


def main():
    """Split a gene tree into subtrees and write one file per subtree.

    Command-line driven: reads the gene tree given by ``--genetree``,
    optionally reconciles it with the species tree given by
    ``--speciestree`` (when ``--gainlose`` is set), then splits it either by
    duplication nodes (``--split dups``, the default) or with the TreeKO
    algorithm (``--split treeko``). Each resulting subtree is written to
    ``<n>_genetree.nhx`` using the ``--output_format`` format code.

    Exits through ``parser.error`` when ``--genetree`` is missing, or when
    ``--gainlose`` is requested without ``--speciestree``.
    """
    usage = "usage: %prog --genetree <genetree-file> --speciestree <speciestree-file> [options]"
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('--genetree', help='GeneTree in nhx format')
    parser.add_option('--speciestree', help='Species Tree in nhx format')
    parser.add_option('--species_format', type='int', default=8, help='Species Tree input format (0-9)')
    parser.add_option('--gene_node', type='int', default=0, help='Gene node format 0=gene_species, 1=species_gene')
    parser.add_option('--gainlose', action='store_true', default=False, help='Find out gene gain/lose')
    parser.add_option('--split', type='choice', choices=['dups', 'treeko'], dest="split", default='dups', help='Choose GeneTree splitting algorithms')
    parser.add_option('--output_format', type='int', default=9, help='GeneTree output format (0-9)')
    options, args = parser.parse_args()

    if options.genetree is None:
        parser.error("--genetree option must be specified, GeneTree in nhx format")

    with open(options.genetree, 'r') as f:
        contents = f.read()

    # Remove empty NHX features that can be produced by TreeBest but break ete3
    contents = contents.replace('[&&NHX]', '')

    # Read the single gene tree from the cleaned-up text
    genetree = PhyloTree(contents)

    # Leaf names of the form gene_species need a custom species extractor;
    # for any other --gene_node value no naming function is set here.
    if options.gene_node == 0:
        genetree.set_species_naming_function(parse_sp_name)

    # Reconcile the species tree with the gene tree to help find out gene gain/lose
    if options.gainlose:
        if options.speciestree is None:
            parser.error("--speciestree option must be specified, species tree in nhx format")

        # Read the species tree
        speciestree = PhyloTree(options.speciestree, format=options.species_format)

        # Remove '*' from species names; it comes from species trees
        # configured for TreeBest
        for leaf in speciestree:
            leaf.name = leaf.name.strip('*')

        # Only the reconciled tree is needed; the events list is discarded
        genetree, _ = genetree.reconcile(speciestree)

    if options.split == "dups":
        # Split the tree at its duplication nodes
        subtrees = genetree.split_by_dups()
    elif options.split == "treeko":
        # Split the tree using the TreeKO algorithm; the tree and
        # duplication counts returned alongside are not used
        _, _, subtrees = genetree.get_speciation_trees()
    else:
        # Unreachable through the CLI: optparse restricts --split to the
        # two choices above. Kept so that nothing is written otherwise.
        subtrees = ()

    _write_subtrees(subtrees, options.output_format)
3
077021c45b96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit a22e605b871c2185e98d89598aebb2fa3a82bb8f
earlhaminst
parents:
diff changeset
67
077021c45b96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit a22e605b871c2185e98d89598aebb2fa3a82bb8f
earlhaminst
parents:
diff changeset
68
077021c45b96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit a22e605b871c2185e98d89598aebb2fa3a82bb8f
earlhaminst
parents:
diff changeset
def parse_sp_name(node_name):
    """Return the species field of a ``gene_species`` style node name.

    The name is split on underscores and the second field is returned, so
    ``"gene1_human"`` gives ``"human"``. An ``IndexError`` is raised when the
    name contains no underscore.
    """
    # Only the second field is needed, so stop splitting after it.
    fields = node_name.split("_", 2)
    return fields[1]
077021c45b96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit a22e605b871c2185e98d89598aebb2fa3a82bb8f
earlhaminst
parents:
diff changeset
71
077021c45b96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit a22e605b871c2185e98d89598aebb2fa3a82bb8f
earlhaminst
parents:
diff changeset
72
077021c45b96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit a22e605b871c2185e98d89598aebb2fa3a82bb8f
earlhaminst
parents:
diff changeset
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()