annotate ete_genetree_splitter.py @ 8:16e925bf567e draft

"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit ed32f2e6d8174873cefcbe141084f857f84b0586"
author earlhaminst
date Thu, 31 Oct 2019 07:48:59 -0400
parents 6a5282f71f82
children b29ee6a16524
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
077021c45b96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit a22e605b871c2185e98d89598aebb2fa3a82bb8f
earlhaminst
parents:
diff changeset
1 from __future__ import print_function
077021c45b96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit a22e605b871c2185e98d89598aebb2fa3a82bb8f
earlhaminst
parents:
diff changeset
2
077021c45b96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit a22e605b871c2185e98d89598aebb2fa3a82bb8f
earlhaminst
parents:
diff changeset
3 import optparse
077021c45b96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit a22e605b871c2185e98d89598aebb2fa3a82bb8f
earlhaminst
parents:
diff changeset
4
077021c45b96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit a22e605b871c2185e98d89598aebb2fa3a82bb8f
earlhaminst
parents:
diff changeset
5 from ete3 import PhyloTree
077021c45b96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit a22e605b871c2185e98d89598aebb2fa3a82bb8f
earlhaminst
parents:
diff changeset
6
077021c45b96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit a22e605b871c2185e98d89598aebb2fa3a82bb8f
earlhaminst
parents:
diff changeset
7
077021c45b96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit a22e605b871c2185e98d89598aebb2fa3a82bb8f
earlhaminst
parents:
diff changeset
def main():
    """Split a gene tree at its duplication nodes, one output file per subtree.

    Options are parsed with optparse. The gene tree file is read as text,
    empty ``[&&NHX]`` tags are removed, and the result is loaded as an ete3
    ``PhyloTree``. When ``--gainlose`` is given, the gene tree is first
    reconciled with the species tree (whose leaf names have any leading or
    trailing ``*`` stripped). Every subtree yielded by ``split_by_dups()`` is
    written to ``<n>_genetree.nhx`` (relative path, numbered from 1) using
    the Newick format selected by ``--output_format``.

    Missing required options are reported through ``parser.error``.
    """
    parser = optparse.OptionParser(
        usage="usage: %prog --genetree <genetree-file> --speciestree <speciestree-file> [options]")

    # Options are registered in table order, which is also the order in
    # which they appear in the generated help text.
    option_table = (
        ('--genetree', {'help': 'GeneTree in nhx format'}),
        ('--speciestree', {'help': 'Species Tree in nhx format'}),
        ('--species_format', {'type': 'int', 'default': 8, 'help': 'Species Tree input format (0-9)'}),
        ('--gene_node', {'type': 'int', 'default': 0, 'help': 'Gene node format 0=gene_species, 1=species_gene'}),
        ('--gainlose', {'action': 'store_true', 'default': False, 'help': 'Find out gene gain/lose'}),
        ('--output_format', {'type': 'int', 'default': 9, 'help': 'GeneTree output format (0-9)'}),
    )
    for flag, settings in option_table:
        parser.add_option(flag, **settings)
    options, _positional = parser.parse_args()

    if options.genetree is None:
        parser.error("--genetree option must be specified, GeneTree in nhx format")

    # Empty NHX feature blocks can be produced by TreeBest but break ete3,
    # so they are dropped from the raw text before parsing.
    with open(options.genetree, 'r') as genetree_file:
        newick_text = genetree_file.read().replace('[&&NHX]', '')

    # The file holds a single gene tree.
    genetree = PhyloTree(newick_text)

    # For gene_species leaf names, take the species from the second
    # underscore-separated field.
    # NOTE(review): for any other --gene_node value no naming function is
    # set, so whatever ete3 uses by default applies - confirm this is the
    # intended handling of species_gene names.
    if options.gene_node == 0:
        genetree.set_species_naming_function(parse_sp_name)

    # Reconcile with the species tree to help find out gene gain/lose.
    if options.gainlose:
        if options.speciestree is None:
            parser.error("--speciestree option must be specified, species tree in nhx format")

        speciestree = PhyloTree(options.speciestree, format=options.species_format)

        # Species trees configured for TreeBest may mark species names
        # with '*'; remove the markers from the leaf names.
        for leaf in speciestree:
            leaf.name = leaf.name.strip('*')

        # Only the reconciled tree is used; the second returned value is
        # not needed here.
        genetree, _events = genetree.reconcile(speciestree)

    # split_by_dups() yields the subtrees obtained by splitting the tree at
    # its duplication nodes; each one goes to its own numbered file.
    for cluster_id, subtree in enumerate(genetree.split_by_dups(), 1):
        with open('%d_genetree.nhx' % cluster_id, 'w') as out:
            out.write(subtree.write(format=options.output_format))
077021c45b96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit a22e605b871c2185e98d89598aebb2fa3a82bb8f
earlhaminst
parents:
diff changeset
55
077021c45b96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit a22e605b871c2185e98d89598aebb2fa3a82bb8f
earlhaminst
parents:
diff changeset
56
077021c45b96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit a22e605b871c2185e98d89598aebb2fa3a82bb8f
earlhaminst
parents:
diff changeset
def parse_sp_name(node_name):
    """Return the species part of a ``<gene>_<species>`` node name.

    The name is split on ``_`` and the second field is returned, so any
    further underscore-separated fields are ignored.

    :param node_name: leaf name to parse.
    :returns: the second underscore-separated field of ``node_name``.
    :raises IndexError: if ``node_name`` contains no underscore. The message
        names the offending node so the bad input can be located.
    """
    fields = node_name.split("_")
    if len(fields) < 2:
        # Same exception type as plain indexing would raise, but with a
        # message that identifies the node instead of "list index out of range".
        raise IndexError(
            "cannot extract species from node name %r: "
            "expected '<gene>_<species>'" % (node_name,)
        )
    return fields[1]
077021c45b96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit a22e605b871c2185e98d89598aebb2fa3a82bb8f
earlhaminst
parents:
diff changeset
59
077021c45b96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit a22e605b871c2185e98d89598aebb2fa3a82bb8f
earlhaminst
parents:
diff changeset
60
077021c45b96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit a22e605b871c2185e98d89598aebb2fa3a82bb8f
earlhaminst
parents:
diff changeset
# Run the splitter only when this file is executed as a script, so that
# importing the module does not trigger argument parsing or file output.
if __name__ == "__main__":
    main()