Repository 'ensembl_get_sequences'
hg clone https://toolshed.g2.bx.psu.edu/repos/earlhaminst/ensembl_get_sequences

Changeset 1:e5dd4bd78bbc (2016-12-12)
Previous changeset 0:76b2c482f1e8 (2016-08-11) Next changeset 2:4b7261f484bb (2016-12-21)
Commit message:
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/Ensembl-REST commit aaf8d501c3a92ed415fdf9293a65468c72aae984-dirty
modified:
test-data/genetree.json
test-data/genetree.phyloxml
added:
get_feature_info.py
get_genetree.py
get_sequences.py
get_sequences.xml
removed:
get_feature_info/get_feature_info.py
get_genetree/get_genetree.py
get_sequences/get_sequences.py
get_sequences/get_sequences.xml
b
diff -r 76b2c482f1e8 -r e5dd4bd78bbc get_feature_info.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/get_feature_info.py Mon Dec 12 07:47:42 2016 -0500
[
@@ -0,0 +1,41 @@
+# A simple tool to connect to the Ensembl server and retrieve feature
+# information using the Ensembl REST API.
+import json
+import optparse
+from urlparse import urljoin
+
+import requests
+
+parser = optparse.OptionParser()
+parser.add_option('-i', '--input', help='List of Ensembl IDs')
+parser.add_option('-e', '--expand', type='choice', choices=['0', '1'],
+                  default='0',
+                  help='Expands the search to include any connected features. e.g. If the object is a gene, its transcripts, translations and exons will be returned as well.')
+
+parser.add_option('-s', '--species', type='choice',
+                  choices=['ensembl', 'ensemblgenomes'], default='ensembl',
+                  help='Specify the genome databases for vertebrates and other eukaryotic species')
+
+parser.add_option('-f', '--format', type='choice',
+                  choices=['full', 'condensed'], default='full',
+                  help='Specify the formats to emit from this endpoint')
+options, args = parser.parse_args()
+if options.input is None:
+    raise Exception('-i option must be specified')
+
+
+server = 'http://rest.%s.org' % options.species
+ext = 'lookup/id'
+
+headers = {'Content-Type': 'application/json', 'Accept': 'application/json'}
+params = dict((k, getattr(options, k)) for k in ['format', 'expand'])
+with open(options.input) as f:
+    ids = [line.strip() for line in f]
+data = {'ids': ids}
+r = requests.post(urljoin(server, ext), params=params, headers=headers,
+                  data=json.dumps(data))
+
+if not r.ok:
+    r.raise_for_status()
+
+print r.text
b
diff -r 76b2c482f1e8 -r e5dd4bd78bbc get_feature_info/get_feature_info.py
--- a/get_feature_info/get_feature_info.py Thu Aug 11 14:29:50 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,41 +0,0 @@
-# A simple tool to connect to the Ensembl server and retrieve feature
-# information using the Ensembl REST API.
-import json
-import optparse
-from urlparse import urljoin
-
-import requests
-
-parser = optparse.OptionParser()
-parser.add_option('-i', '--input', help='List of Ensembl IDs')
-parser.add_option('-e', '--expand', type='choice', choices=['0', '1'],
-                  default='0',
-                  help='Expands the search to include any connected features. e.g. If the object is a gene, its transcripts, translations and exons will be returned as well.')
-
-parser.add_option('-s', '--species', type='choice',
-                  choices=['ensembl', 'ensemblgenomes'], default='ensembl',
-                  help='Specify the genome databases for vertebrates and other eukaryotic species')
-
-parser.add_option('-f', '--format', type='choice',
-                  choices=['full', 'condensed'], default='full',
-                  help='Specify the formats to emit from this endpoint')
-options, args = parser.parse_args()
-if options.input is None:
-    raise Exception('-i option must be specified')
-
-
-server = 'http://rest.%s.org' % options.species
-ext = 'lookup/id'
-
-headers = {'Content-Type': 'application/json', 'Accept': 'application/json'}
-params = dict((k, getattr(options, k)) for k in ['format', 'expand'])
-with open(options.input) as f:
-    ids = [line.strip() for line in f]
-data = {'ids': ids}
-r = requests.post(urljoin(server, ext), params=params, headers=headers,
-                  data=json.dumps(data))
-
-if not r.ok:
-    r.raise_for_status()
-
-print r.text
b
diff -r 76b2c482f1e8 -r e5dd4bd78bbc get_genetree.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/get_genetree.py Mon Dec 12 07:47:42 2016 -0500
[
@@ -0,0 +1,58 @@
+# A simple tool to connect to the Ensembl server and retrieve genetree using
+# the Ensembl REST API.
+import optparse
+from urlparse import urljoin
+
+import requests
+
+parser = optparse.OptionParser()
+parser.add_option('--id_type', type='choice', default='gene_id',
+                  choices=['gene_id', 'gene_tree_id'], help='Input type')
+parser.add_option('-i', '--input', help='Ensembl ID')
+parser.add_option('--format', type='choice',
+                  choices=['json', 'orthoxml', 'phyloxml', 'nh'],
+                  default='json', help='Output format')
+parser.add_option('-s', '--sequence', type='choice',
+                  choices=['protein', 'cdna', 'none'], default='protein',
+                  help='The type of sequence to bring back. Setting it to none results in no sequence being returned')
+
+parser.add_option('-g', '--species', type='choice',
+                  choices=['ensembl', 'ensemblgenomes'], default='ensembl',
+                  help='Specify the genome databases for vertebrates and other eukaryotic species')
+
+parser.add_option('-a', '--aligned', type='choice', choices=['0', '1'],
+                  default='0', help='Return the aligned string if true. Otherwise, return the original sequence (no insertions)')
+parser.add_option('-c', '--cigar_line', type='choice', choices=['0', '1'],
+                  default='0',
+                  help='Return the aligned sequence encoded in CIGAR format')
+parser.add_option('--nh_format', type='choice',
+                  choices=['full', 'display_label_composite', 'simple', 'species', 'species_short_name', 'ncbi_taxon', 'ncbi_name', 'njtree', 'phylip'],
+                  default='simple',
+                  help='The format of a NH (New Hampshire) request')
+options, args = parser.parse_args()
+if options.input is None:
+    raise Exception('-i option must be specified')
+
+server = 'http://rest.%s.org' % options.species
+
+if options.id_type == 'gene_id':
+    ext = 'genetree/member/id'
+elif options.id_type == 'gene_tree_id':
+    ext = 'genetree/id'
+
+if options.format == 'json':
+    content_type = 'application/json'
+elif options.format == 'orthoxml':
+    content_type = 'text/x-orthoxml+xml'
+elif options.format == 'phyloxml':
+    content_type = 'text/x-phyloxml+xml'
+elif options.format == 'nh':
+    content_type = 'text/x-nh'
+headers = {'Content-Type': content_type}
+params = dict((k, getattr(options, k)) for k in ['sequence', 'aligned', 'cigar_line', 'nh_format'])
+r = requests.get(urljoin(server, '/'.join([ext, options.input])), params=params, headers=headers)
+
+if not r.ok:
+    r.raise_for_status()
+
+print r.text
b
diff -r 76b2c482f1e8 -r e5dd4bd78bbc get_genetree/get_genetree.py
--- a/get_genetree/get_genetree.py Thu Aug 11 14:29:50 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,58 +0,0 @@
-# A simple tool to connect to the Ensembl server and retrieve genetree using
-# the Ensembl REST API.
-import optparse
-from urlparse import urljoin
-
-import requests
-
-parser = optparse.OptionParser()
-parser.add_option('--id_type', type='choice', default='gene_id',
-                  choices=['gene_id', 'gene_tree_id'], help='Input type')
-parser.add_option('-i', '--input', help='Ensembl ID')
-parser.add_option('--format', type='choice',
-                  choices=['json', 'orthoxml', 'phyloxml', 'nh'],
-                  default='json', help='Output format')
-parser.add_option('-s', '--sequence', type='choice',
-                  choices=['protein', 'cdna', 'none'], default='protein',
-                  help='The type of sequence to bring back. Setting it to none results in no sequence being returned')
-
-parser.add_option('-g', '--species', type='choice',
-                  choices=['ensembl', 'ensemblgenomes'], default='ensembl',
-                  help='Specify the genome databases for vertebrates and other eukaryotic species')
-
-parser.add_option('-a', '--aligned', type='choice', choices=['0', '1'],
-                  default='0', help='Return the aligned string if true. Otherwise, return the original sequence (no insertions)')
-parser.add_option('-c', '--cigar_line', type='choice', choices=['0', '1'],
-                  default='0',
-                  help='Return the aligned sequence encoded in CIGAR format')
-parser.add_option('--nh_format', type='choice',
-                  choices=['full', 'display_label_composite', 'simple', 'species', 'species_short_name', 'ncbi_taxon', 'ncbi_name', 'njtree', 'phylip'],
-                  default='simple',
-                  help='The format of a NH (New Hampshire) request')
-options, args = parser.parse_args()
-if options.input is None:
-    raise Exception('-i option must be specified')
-
-server = 'http://rest.%s.org' % options.species
-
-if options.id_type == 'gene_id':
-    ext = 'genetree/member/id'
-elif options.id_type == 'gene_tree_id':
-    ext = 'genetree/id'
-
-if options.format == 'json':
-    content_type = 'application/json'
-elif options.format == 'orthoxml':
-    content_type = 'text/x-orthoxml+xml'
-elif options.format == 'phyloxml':
-    content_type = 'text/x-phyloxml+xml'
-elif options.format == 'nh':
-    content_type = 'text/x-nh'
-headers = {'Content-Type': content_type}
-params = dict((k, getattr(options, k)) for k in ['sequence', 'aligned', 'cigar_line', 'nh_format'])
-r = requests.get(urljoin(server, '/'.join([ext, options.input])), params=params, headers=headers)
-
-if not r.ok:
-    r.raise_for_status()
-
-print r.text
b
diff -r 76b2c482f1e8 -r e5dd4bd78bbc get_sequences.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/get_sequences.py Mon Dec 12 07:47:42 2016 -0500
[
@@ -0,0 +1,46 @@
+# A simple tool to connect to the Ensembl server and retrieve sequences using
+# the Ensembl REST API.
+import json
+import optparse
+from itertools import islice
+from urlparse import urljoin
+
+import requests
+
+parser = optparse.OptionParser()
+parser.add_option('-i', '--input', help='List of Ensembl IDs')
+
+parser.add_option('-s', '--species', type='choice',
+                  choices=['ensembl', 'ensemblgenomes'], default='ensembl',
+                  help='Specify the genome databases for vertebrates and other eukaryotic species')
+
+parser.add_option('-t', '--type', type='choice',
+                  choices=['genomic', 'cds', 'cdna', 'protein'],
+                  default='genomic', help='Type of sequence')
+parser.add_option('--expand_3prime', type='int', default=0,
+                  help='Expand the sequence downstream of the sequence by this many basepairs. Only available when using genomic sequence type')
+parser.add_option('--expand_5prime', type='int', default=0,
+                  help='Expand the sequence upstream of the sequence by this many basepairs. Only available when using genomic sequence type')
+options, args = parser.parse_args()
+if options.input is None:
+    raise Exception('-i option must be specified')
+
+server = 'http://rest.%s.org' % options.species
+ext = 'sequence/id'
+
+headers = {'Content-Type': 'text/x-fasta', 'Accept': 'text/x-fasta'}
+params = dict((k, getattr(options, k)) for k in ['type', 'expand_3prime', 'expand_5prime'])
+with open(options.input) as f:
+    # Need to split the file in chunks of 50 lines because of the limit imposed by Ensembl
+    while True:
+        ids = [line.strip() for line in islice(f, 50)]
+        if not ids:
+            break
+        data = {'ids': ids}
+        r = requests.post(urljoin(server, ext), params=params, headers=headers,
+                          data=json.dumps(data))
+
+        if not r.ok:
+            r.raise_for_status()
+
+        print r.text
b
diff -r 76b2c482f1e8 -r e5dd4bd78bbc get_sequences.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/get_sequences.xml Mon Dec 12 07:47:42 2016 -0500
[
@@ -0,0 +1,61 @@
+<tool id="get_sequences" name="Get sequences by Ensembl ID" version="0.1.1">
+    <description>using REST API</description>
+    <requirements>
+        <requirement type="package" version="2.7">requests</requirement>
+    </requirements>
+    <command>
+<![CDATA[
+python $__tool_directory__/get_sequences.py
+-s $species_selector
+--expand_3prime $expand_3prime
+--expand_5prime $expand_5prime
+-t $type_selector
+-i "$input"
+> "$output"
+]]>
+    </command>
+
+    <inputs>
+        <param name="input" type="data" format="txt" label="List of Ensembl IDs" />
+        <param name="species_selector" type="select" label="Select Species">
+            <option value="ensembl" selected="true">Vertebrates</option>
+            <option value="ensemblgenomes">Other species</option>
+        </param>
+        <param name="expand_3prime" type="integer" value="0" min="0" label="expand_3prime" help="Expand each sequence downstream of the sequence by this many basepairs. Only available when using genomic sequence type." />
+        <param name="expand_5prime" type="integer" value="0" min="0" label="expand_5prime" help="Expand each sequence upstream of the sequence by this many basepairs. Only available when using genomic sequence type." />
+        <param name="type_selector" type="select" label="Type" help="Type of sequence. Defaults to genomic where applicable, i.e. not translations. cDNA refers to the spliced transcript sequence with UTR; CDS refers to the spliced transcript sequence without UTR">
+            <option value="genomic" selected="true">Genomic</option>
+            <option value="cds">CDS</option>
+            <option value="cdna">cDNA</option>
+            <option value="protein">Protein</option>
+        </param>
+    </inputs>
+
+    <outputs>
+        <data name="output" format="fasta" label="$(tool.name) on ${on_string}" />
+    </outputs>
+
+    <tests>
+          <test>
+                <param name="input" ftype="txt" value="input.txt" />
+                <param name="expand_3prime" value="0" />
+                <param name="expand_5prime" value="0" />
+                <param name="type_selector" value="genomic" />
+                <output name="output" file="sequences.fasta" />
+          </test>
+     </tests>
+
+    <help>
+<![CDATA[
+**What it does**
+
+Retrieves FASTA sequences from Ensembl using its REST API.
+
+Uses the `"POST sequence/id"`_ API endpoint.
+
+.. _"POST sequence/id": http://rest.ensembl.org/documentation/info/sequence_id_post
+]]>
+    </help>
+    <citations>
+    </citations>
+</tool>
b
diff -r 76b2c482f1e8 -r e5dd4bd78bbc get_sequences/get_sequences.py
--- a/get_sequences/get_sequences.py Thu Aug 11 14:29:50 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,46 +0,0 @@
-# A simple tool to connect to the Ensembl server and retrieve sequences using
-# the Ensembl REST API.
-import json
-import optparse
-from itertools import islice
-from urlparse import urljoin
-
-import requests
-
-parser = optparse.OptionParser()
-parser.add_option('-i', '--input', help='List of Ensembl IDs')
-
-parser.add_option('-s', '--species', type='choice',
-                  choices=['ensembl', 'ensemblgenomes'], default='ensembl',
-                  help='Specify the genome databases for vertebrates and other eukaryotic species')
-
-parser.add_option('-t', '--type', type='choice',
-                  choices=['genomic', 'cds', 'cdna', 'protein'],
-                  default='genomic', help='Type of sequence')
-parser.add_option('--expand_3prime', type='int', default=0,
-                  help='Expand the sequence downstream of the sequence by this many basepairs. Only available when using genomic sequence type')
-parser.add_option('--expand_5prime', type='int', default=0,
-                  help='Expand the sequence upstream of the sequence by this many basepairs. Only available when using genomic sequence type')
-options, args = parser.parse_args()
-if options.input is None:
-    raise Exception('-i option must be specified')
-
-server = 'http://rest.%s.org' % options.species
-ext = 'sequence/id'
-
-headers = {'Content-Type': 'text/x-fasta', 'Accept': 'text/x-fasta'}
-params = dict((k, getattr(options, k)) for k in ['type', 'expand_3prime', 'expand_5prime'])
-with open(options.input) as f:
-    # Need to split the file in chunks of 50 lines because of the limit imposed by Ensembl
-    while True:
-        ids = [line.strip() for line in islice(f, 50)]
-        if not ids:
-            break
-        data = {'ids': ids}
-        r = requests.post(urljoin(server, ext), params=params, headers=headers,
-                          data=json.dumps(data))
-
-        if not r.ok:
-            r.raise_for_status()
-
-        print r.text
b
diff -r 76b2c482f1e8 -r e5dd4bd78bbc get_sequences/get_sequences.xml
--- a/get_sequences/get_sequences.xml Thu Aug 11 14:29:50 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,61 +0,0 @@
-<tool id="get_sequences" name="Get sequences by Ensembl ID" version="0.1.1">
-    <description>using REST API</description>
-    <requirements>
-        <requirement type="package" version="2.7">requests</requirement>
-    </requirements>
-    <command>
-<![CDATA[
-python $__tool_directory__/get_sequences.py
--s $species_selector
---expand_3prime $expand_3prime
---expand_5prime $expand_5prime
--t $type_selector
--i "$input"
-> "$output"
-]]>
-    </command>
-
-    <inputs>
-        <param name="input" type="data" format="txt" label="List of Ensembl IDs" />
-        <param name="species_selector" type="select" label="Select Species">
-            <option value="ensembl" selected="true">Vertebrates</option>
-            <option value="ensemblgenomes">Other species</option>
-        </param>
-        <param name="expand_3prime" type="integer" value="0" min="0" label="expand_3prime" help="Expand each sequence downstream of the sequence by this many basepairs. Only available when using genomic sequence type." />
-        <param name="expand_5prime" type="integer" value="0" min="0" label="expand_5prime" help="Expand each sequence upstream of the sequence by this many basepairs. Only available when using genomic sequence type." />
-        <param name="type_selector" type="select" label="Type" help="Type of sequence. Defaults to genomic where applicable, i.e. not translations. cDNA refers to the spliced transcript sequence with UTR; CDS refers to the spliced transcript sequence without UTR">
-            <option value="genomic" selected="true">Genomic</option>
-            <option value="cds">CDS</option>
-            <option value="cdna">cDNA</option>
-            <option value="protein">Protein</option>
-        </param>
-    </inputs>
-
-    <outputs>
-        <data name="output" format="fasta" label="$(tool.name) on ${on_string}" />
-    </outputs>
-
-    <tests>
-          <test>
-                <param name="input" ftype="txt" value="input.txt" />
-                <param name="expand_3prime" value="0" />
-                <param name="expand_5prime" value="0" />
-                <param name="type_selector" value="genomic" />
-                <output name="output" file="sequences.fasta" />
-          </test>
-     </tests>
-
-    <help>
-<![CDATA[
-**What it does**
-
-Retrieves FASTA sequences from Ensembl using its REST API.
-
-Uses the `"POST sequence/id"`_ API endpoint.
-
-.. _"POST sequence/id": http://rest.ensembl.org/documentation/info/sequence_id_post
-]]>
-    </help>
-    <citations>
-    </citations>
-</tool>
b
diff -r 76b2c482f1e8 -r e5dd4bd78bbc test-data/genetree.json
--- a/test-data/genetree.json Thu Aug 11 14:29:50 2016 -0400
+++ b/test-data/genetree.json Mon Dec 12 07:47:42 2016 -0500
[
b'@@ -1,1 +1,1 @@\n-{"tree":{"events":{"type":"speciation"},"branch_length":0,"children":[{"events":{"type":"speciation"},"branch_length":0.153275,"children":[{"events":{"type":"speciation"},"branch_length":0.155187,"children":[{"events":{"type":"speciation"},"branch_length":0.133192,"children":[{"events":{"type":"speciation"},"branch_length":0.201095,"children":[{"events":{"type":"speciation"},"branch_length":0.015782,"children":[{"events":{"type":"speciation"},"branch_length":0.217419,"children":[{"sequence":{"mol_seq":{"seq":"QLARDMQDMRIRKKKRQTIRPLPGSLFQKKSSGVARIPFKAAVNGKPPARYTAKPLCGLGVPLNVLEITSETAESFRFSLQHFVKLESLIDKGGIQLADGGWLIPTNDGTAGKEEFYRALCDTPGVDPKLMSEEWVYNHYRWIVWKQASMERSFPEEMGSLCLTPEQVLLQLKYRYDIEVDHSRRPALRKIMEKDDTAAKTLVLCVCGVVFRGSSPKNKSFGDISTPGADPKVENPCAVVWLTDGWYSIKAQLDGPLTSMLHRGRLPVGGKLIIHGAQLVGSENACSPLEAPVSLMLKICANSSRPARWDSKLGFHRDPRPFLLPVSSLYSSGGPVGCVDIIILRSYPILWMERKPEGGTVFRSGRAEEKEARRYNIHKEKAMEILFDKIKAEFEKEEKGNRKPQCRRTINGQNITSLQDGEELYEAVGDDPAFLEAHLTEKQVEVLQNYKRLVMEKQQAELQDRYRRAVESAEDGVGGCPKRDVAPVWRLCIADSMGHSGRVYQLSLWRPPSELQALLKEGCRYKVYNLTTLDSKKQGGNATVQLTATKKTQFEHLQGSEEWLSKHFQPRVATNFVRLQDPEFNPLCSEVDLTGYVITIIDGQGFSPAFYLADGKQNFVKVRCFSSFAQSGLEDVIKPRVLLALSNLQLRGQSTSPTPVVYAGDLTVFSTNPKEVHLQESFSQLKTLVQGQENFFVHAEEKLSQLMSDGLSAIASPAGQIQTPASTVKRRGDMTDVSSNIMVINKTSKVTCQQPGRSHRFSTPINRNSTAHSSAERNPSTIKKRKALDYLSHIPSPPPLSCLSTLSSPSVKKIFIPPRRTEIPGTLKTVKTPNQKPSNTPVDDQWVNDEELAMIDTQAL","is_aligned":0},"location":"scaffold_19:196046-199577","name":"brca2-201","id":[{"source":"EnsEMBL","accession":"ENSTRUP00000015030"}]},"branch_length":0.072273,"id":{"source":"EnsEMBL","accession":"ENSTRUG00000006177"},"confidence":{},"taxonomy":{"scientific_name":"Takifugu 
rubripes","id":31033}},{"sequence":{"mol_seq":{"seq":"VSFSSDTPRKPKAGSLSSEFTDRFLAQEALDCTKALLEDERLVDDPHMTGECLHRCPQFSLLVNLFVKPHTAVLIPEQPPLKRRLLEEFDRTDGSSRGSALNPEKCSPNGIMGDRRVFKCSVSFQPNITTPHRICSQKAERPVSFLSRRSGTNYVETSLPNTTPTKVSALRDSNEARLQKSNFIPPFIKNVKLDTPNSKTASTFVPPFKKSRNSSKTEEEEPKHHFIPPFTNPCATSSTKKHTAGHLHNVELARDMQGMRIRKKKRQTILPLPGSLFLKKSSGVTRIPLKSAVNGKPPARYTPKQLYGLGVPLNVLEITSETAGSFRFSLQQFVKLESLTDKGGIQLADGGWLIPRNDGTAGKEEFYRALCDTTGVDPKLISEEWVYNHYRWIVWKQASMERSFPEQLGSLCLTPEQVLLQLKYRYDIEVDQSRRPALRKIMERDDTAAKTLILCVCGVVSRGSSPQKQGLGGVAAPSSDPQVENPFAVVWLTDGWYSIKAQLDGPLTSMLNRGRLPVGGKLIIHGAQLVGSQDACSPLEAPESIMLKIFANSSRRARWDAKLGFYRDPRPFLLPVSSLYNSGGPVGCVDIIILRSYPTLWMERKPEGGTVFRSGRAEEKEARRYNVHKEKAMEILFDKIQAEFEKEERDNRKPRSRRRTIGDQDIKSLQDGEELYEAVGDDPAYLEAHLTEQQAETLQNYKRLLIEKKQAELQDRYRRAVETAEDGTGSCPKRDVAPVWRLSIADFMEKPGSVYQLNIWRPPSELQSLLKEGCRYKVYNLTTTDSKKQGGNTTVQLSGTKKTQFEDLQASEELLSTYFQPRVSATFIDLQDPEFHSLCGEVDLTGYVISIIDGQGFSPAFYLTDGKQNFVKVRCFSSFAQSGLEDVIKPSVLLALSNLQLRGQATSPTPVLYAGDLTVFSTNPKEVHLQESFSQLKTLVQ","is_aligned":0},"location":"16:4700614-4705074","name":"brca2-201","id":[{"source":"EnsEMBL","accession":"ENSTNIP00000002435"}]},"branch_length":0.113355,"id":{"source":"EnsEMBL","accession":"ENSTNIG00000016261"},"confidence":{},"taxonomy":{"scientific_name":"Tetraodon 
nigroviridis","id":99883}}],"confidence":{"bootstrap":100},"taxonomy":{"scientific_name":"Tetraodontidae","id":31031}},{"sequence":{"mol_seq":{"seq":"LPNVELAQDMQDMRIRKKKRQTIRPLPGSLFLTKTSGVTRIPLKAALVFLLQLYRHGVHQHVCEISSETAESFRFNLKQFIKREALLDGGGVQLADGGWLIPSKDGTAGKEEFYRALCDTPGVDPKLISDGWVDNHYRWVVWKQASMERSFPETMGGLCLTPEQVLLQLKYRYDVEVDHSRRPALRRITERDDTAAKTLVLCVCGVVSRSFDDSKTPRGADAGGGNPSAVVWLTDGWYAIRAQLDEPLTAMLRNGRVAVGSKLIVHGAQLVGSQEACSPLEAPEALMLKICANSSRPVRWDAKLGFHKDPRPFLLPLSCLYSSGGQVGCVDMIVLRSYPIQWMERKPEGGVVFRSVRAEEKEAKRFNGLKQKAMEILFAKIQDEFEKEDKGRTCDFTTQAISRQAIAGLQAGEELCEAVGEDPAHLEALLSEQQVETLNTYRRCVMEKKQAQLHDRFQRALESAEASEGSCPKREVTPVWRLGVADSRDQRGRVYQLNLWRPSSDLQALLKEGRRYKVYNLTTSDGKKHNGSSNVQLTGTKKTQFQDLQASREWLSTRFQPRVSACFVDLQNPEFQSLCGEVDLTGFVIQIVDGQGFSPAFYLADGELNFVKVRCFSSFAQSGLEDLVKPRVLLSLSNLQLRGQSASPTPVVYAGDLTVFSANPKDAHLQESLSQNKNLRQSQENFFLIAEETLSRLVQSDGRRPLSSPALHTRTPALATSMIQDTTASVKCVLLMQGASQQLVRSRGTFTPVSRKPPAANCSTEKDAGSVKRRRALNYLSHIPSPPPLLNLGSVASPCVNKTFNPPRRSGTPSTLKTVQTPAHKAQKVDSLVEDEWVNDEELAMIDTQAL","is_aligned":0},"location":"groupI:13362884-13366744","name'..b'EDISTSRNALEIRPELYPEGMCSNRASNGSGNNSEFTAGEGISININQSSLLTTGNVLKNLPSESSGHDVYSVTEHLSTVVKVKRYNDSGHFVNQNLAECNDNHVLSTQKNTANISNRNEDCTSLAPLSFSTASGKSVTVSHDSLQKARLMLSEAANDVTVDTSKQEAAYITPAIRKTEAEKEQNTVDDSDRVNANTFSFSTASGKKVNISGNSLKQVRAVCLSSDPKETSAALFNVEKSVFNEDVKDVSLLQPNVTMPKAVSFSTASGKTVQLSDESLKKARVIFSEIDTCPLMQQQTNESTVEEIVIGGGMTKSKQMPLTTEKVETTRKNNGTFGFNTASGKQVSVSESALQKVKDIFQEFDDPDNYEQNKSLVRLPVSSKIKESTPGTKRLVQTAGSSYKNDNLQCKAGNLRTFQDKQAGKKSLTYSEAAISPIESSVPIYEMQVMLKHTNNQACKYQPRVEVPLQDQRWQNILEIELPATCAPAFRETHNILFFGDLQHSTHFDICSLYSGKNPAVKHQLASHSKMQTLVISGRDSSGTLTLQFTLRIVILHTVNNQYSLNKQLFTFSSALRQVTCIPTQAHLHSKVKIFHQSLPIKSPDVASDSTSKSYSPTAAKETINCSSASKIPAKKFVPPFKKTVATLADNQSNSVQNGSSDGLIESIVYPKEDKVETICSSKDQFDDSDILQMTSNLRCSKDLQEMRIRKKLRQKIKPHPGSLYRLKMSHVKRISLQSAVAERCPTLYSREQLYRYGIVKNHIGVSSENALSFQFHCSNYFTKELLLSGNGVQLADGGWLIPTEQGNAGKEEIYRAFCDTPGVDPKLISAEWVHNHYRWIVWKLAAMEVRFPKTFACRCLTPERVLLQLKYRYDVEIDKSQRSAIKKIMERDDSPAKTLVLCIAKIISQGTRLPNACSNKTEPADSKESSAVI
EVTDSWYGIKVLLDPCLTALLHKGRLFIGQKLIVHGAELIGSDDACSPLEAPESLMLKIAANSTRPVRWHTKLGYFKDPRPFCLHLSSLLSEGGVVGCVDVVIQRIYPMQWMEKMANGLYVFRNDRAEEREAEKHSANQQKKLEMLFSKIQAEFEQREVTCNRRKGLRRRSLNAQQMQTLQDGAEIYEAIQNESDPGYLESYLSAEQLKALNHHRQLLNDKKQALIQAEFRKAIECSEQDANGCTRRDVTPVWKLRIADYRNYETDAAYILNIWRPLPDVLSLLKEGCRYKMYHLAASTSKGKSLAADLQLTATKKTRFQQLQLSESILEQIYSPREVTDFSRFQEPLFSAPYAEVDLVGLIISIYKKTGAAPVVYISDESHNIVALKFWTDLGQLGLEEITKPRTYISASNLRWRSDCIEGIPTLYVGDLANISSNPKESHLQRAIQKLKLSVQNVQDFWNSSQTALMKTLQINSTDTTECSKNPTTPTWKSDVSARSGYLTPLHHSGKRLLNSVHTSDPQTENPGCSKEIQLKTCKKRKALDFLNRIPSPPPVTPVRPFVSPSLQKAFRPPRSCSVQKLGPETKGNTENVQGTTPECTKDLAKLEGEFVADEELAMINTQALLLGLEEEKKKTEQKTSRTAGKMTAHESPIENASPVPAQEQQTEEALNIPVGNSEKSYLCLRKRKRK","is_aligned":0},"location":"GL172716.1:1071058-1096238","name":"brca2-201","id":[{"source":"EnsEMBL","accession":"ENSXETP00000060681"}]},"branch_length":0.756151,"id":{"source":"EnsEMBL","accession":"ENSXETG00000017011"},"confidence":{},"taxonomy":{"scientific_name":"Xenopus tropicalis","common_name":"X.tropicalis","id":8364}}],"confidence":{"bootstrap":1},"taxonomy":{"scientific_name":"Tetrapoda","common_name":"Tetrapods","timetree_mya":371.2,"id":32523}},{"sequence":{"mol_seq":{"seq":"MEWEAVESVKALMRDDELTDAGLDASKDSLNRACRRQSGGNFRARKRMRLEQVSADEPPVKRQLLAEFDRTVENGHKSLQKPLICTPNGTLKDRRKFMYSVPLKPVVCGPWSNNSKTGQQVTKPSITLPGRGVETFQPKNHIAPSPVYDPPSNRRGPVFAPPFHGATFRGLQKPSASHTSSKTAKTFVPPFKMKASASHTVHFSSKVINTCEKILENLVYLKPSLASCNIFQSLEEMTANLQCARDLQEMRLRKKQRQNIRPQPGSLYLAKTSGVARVSLKAATGNQCPSSYSTEQLYVHGVGKSTLKVRSENAESFQFSCSDYFGKDVLLAGNGLKLADGGWLIPSDKGMVGKEEFYRALCDTPGVAPKLISESWVYNHYRWIVWKLAAMEAAFPKEFGNRCLTPERVLLQLKYRYDIEVDKCRRSTVKKIMERDDTAAKTLVLCISKLISVEDRFKQTKNKNEKGAEEARKEAVAGVIETTDGWYGIKVLLDPPLTVLVQRGRLSVGCKIITHGAEIIGSQDACTPLEAPECLMLKISANSTRPACWSAKLGFHRDPRPFPLPLASLFNDGGLVGCVDVVVVRLYPIQWMEKKSDGIFVFRNDRAEEREAQRQVENQQRKMESLFAKIQTEFEQKYEAKSKRRGQKAQKFSKQEIQALQDGAELNEAIENSMDPGYFEACLREEQLKVLHGHRQMLNEKKQAEFQAEFKKALESAEQEGKSCCKRGVTPVWKLRIVDYRKPSAAEYILNIWRPLADLHSLLKEGNRYRIYQLLASQSKGRTTTADIQLTATKKTQYQQFQSFPELISELYSPRK
AVKFNMLMDPTFRPAYAEVDLVGYTISIEGKPGVAPVVYLSDESHNFVAIKVWTALNQLAVEDIVKPFSLIAASNLQWRSDSRSIIPMLYAGDLSIFSSNPKEGHLQEAFNQRRTAIQENISGTYLPPEKKNLHQESYKSCQYNTLNVLMNGNIHTQSPVLSRVHMGTSCAFLFLLPSPYPESKHTSPLITMKAGVKSMTFPGSAKLMPQASENQELDTPKNRKKKAALDYLCRIPSPPALTPIRSFVSSSLQKAFHPPRSCVKLQSGENPVVPTVGNNAVLGIQSKKDEGPAAFNEEDSVADEELAMINTQAFLVGLRRDKRPSLLDKTASLKGHVPSERFLEEKLLSVLKEQASSNSERNATSLENKSCDKSRTCVKPCEHSNDSIAEETSEIIPGCHGGESAVENQSKNSSLCHKKLQQKKRRKYY","is_aligned":0},"location":"JH127744.1:299190-332700","id":[{"source":"EnsEMBL","accession":"ENSLACP00000008815"}]},"branch_length":0.314542,"id":{"source":"EnsEMBL","accession":"ENSLACG00000007788"},"confidence":{},"taxonomy":{"scientific_name":"Latimeria chalumnae","common_name":"Coelacanth","id":7897}}],"confidence":{"bootstrap":1},"taxonomy":{"scientific_name":"Sarcopterygii","common_name":"Lobe-finned fish","timetree_mya":414.9,"id":8287}}],"confidence":{},"taxonomy":{"scientific_name":"Euteleostomi","common_name":"Bony vertebrates","timetree_mya":441,"id":117571}},"rooted":1,"id":"ENSGT00390000003602","type":"gene tree"}\n'
b
diff -r 76b2c482f1e8 -r e5dd4bd78bbc test-data/genetree.phyloxml
--- a/test-data/genetree.phyloxml Thu Aug 11 14:29:50 2016 -0400
+++ b/test-data/genetree.phyloxml Mon Dec 12 07:47:42 2016 -0500
b
b'@@ -4,171 +4,52 @@\n   <phylogeny rooted="true" type="gene tree">\n     <clade branch_length="0">\n       <taxonomy>\n+        <scientific_name>Euteleostomi</scientific_name>\n         <id>117571</id>\n-        <scientific_name>Euteleostomi</scientific_name>\n+        <common_name>Bony vertebrates</common_name>\n       </taxonomy>\n-      <clade branch_length="0.153275">\n-        <confidence type="bootstrap">95</confidence>\n+      <clade branch_length="0.149761">\n+        <confidence type="bootstrap">92</confidence>\n         <taxonomy>\n+          <scientific_name>Neopterygii</scientific_name>\n           <id>41665</id>\n-          <scientific_name>Neopterygii</scientific_name>\n+          <common_name>Ray-finned fishes</common_name>\n         </taxonomy>\n-        <clade branch_length="0.155187">\n-          <confidence type="bootstrap">19</confidence>\n+        <clade branch_length="0.148891">\n+          <confidence type="bootstrap">33</confidence>\n           <taxonomy>\n+            <scientific_name>Clupeocephala</scientific_name>\n             <id>186625</id>\n-            <scientific_name>Clupeocephala</scientific_name>\n+            <common_name>Teleost fishes</common_name>\n           </taxonomy>\n-          <clade branch_length="0.133192">\n-            <confidence type="bootstrap">18</confidence>\n+          <clade branch_length="0.181209">\n+            <confidence type="bootstrap">46</confidence>\n             <taxonomy>\n-              <id>123368</id>\n-              <scientific_name>Acanthomorphata</scientific_name>\n+              <scientific_name>Otophysi</scientific_name>\n+              <id>186626</id>\n+              <common_name>Teleost fishes</common_name>\n             </taxonomy>\n-            <clade branch_length="0.374304">\n-              <name>ENSGMOG00000009699</name>\n+            <clade branch_length="0.421267">\n+              <name>ENSAMXG00000013027</name>\n               <taxonomy>\n-                <id>8049</id>\n-          
      <scientific_name>Gadus morhua</scientific_name>\n+                <id>7994</id>\n+                <scientific_name>Astyanax mexicanus</scientific_name>\n+                <common_name>Cave fish</common_name>\n               </taxonomy>\n               <sequence>\n-                <accession source="Ensembl">ENSGMOP00000010385</accession>\n+                <accession source="Ensembl">ENSAMXP00000013440</accession>\n                 <name>brca2-201</name>\n-                <location>GeneScaffold_2233:16156-29802</location>\n-                <mol_seq is_aligned="0">LARDLQDMRLRKKKRQTVRPLPGSLFLAKASGGARIPLRAALRQLYQHGVHQPVWTVTAENAESFRLSFRRFFRWGSSVSRGVQLADGGWLVPRDDWTLGKEEFYRALCDSPGVDVKLLSQEWAYNHYRWVVWKLASMERSFPLTMASLWLNPEQILLQLKYRYDVEVDHSRRPALRKITERDDAAAKTLVLCVCGVVPGADQQPQGSHAPPPGVVWLTDGWYAIKAQLDAPLTAMLRRGGAGGKLVVYGAELVGSQDGCSPLEAPEGLMLKIGANSCRRARWDAKLGFQRDPRPFLLRLSSLFSTGGAVGCVDLLILRSYPVLWMEKKQDGVFVFRSGRAEEREARRFDDHNNKTMEALYAKIQADIQREDKGSARERNSGEELYEAFENDPAYLEACLNDQQLEVLQSYRRSVLEKRQAGLQERCRRALEQAQESQGGCPRRDVTPVWKLCVVDARAPPGYMLNVWRPPADLQAQLKEGARYRVYNLSVTAGKKRNPGASVQLTATSKTHFQEVQVGQDWLSDHFQARQAVHFQELQRPEVQSACGEVDLVGYVVTTADTHGTSPVVYLVDGDLNLVKVRCFSSLLQWGLEELVKPATLLALSNLQLSARRATTLPVLYASDLTAFSSNPREAHLQSSHSNADRXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXRRRALDYLSRMPSPPPLGPLGSLASPACVKKTFNPPRRSTTPAAVATTRQTPAHGPRVGPWQEEEWENDEGLAQIDTQVL</mol_seq>\n+                <location>KB882257.1:1212119-1232402</location>\n+                <mol_seq 
is_aligned="0">MIEDFFHQIETEELGPLSSDWFKELTAKASKDESFPGAVEHEASSRTEDGTFRAPQETPALESQMSSTPRLFRVRGPLSPDSVLGRSPNPAFQQQGMQTPLSTLPWTDSSPCLFGSAKESQRFEEDSEPLKKHDYFGLLDTPKSSLVQDSSAKRISESLGAQLHPDLSWSSNFNTPGAMSPTVILTKKDAQPSPVSFLKDKEVIIVRKLFPSLSKGTESTSEITSTAHNNASLTEENAQTKGPDESFDNVEGLWRQTVPDAINDSDVRDTVESVLDGAEDVLSIFFSNSSSALRRVKSKERTKKRVNGVSKDVKPAALATQHYTDRTGTATEAKSPPKNKDFSQWSPLSLSQVSDAKAKQDCTSDLANKHLNFESVNSDSGEDALKDSITQLGQFNTCEVEKQKSSPDSYPEKSQLRSSLLAASPALTFSRKPRKFVYQVQSPFPPTKENDLTGKHYREPFLTAENKKHIKDDPQTSDTVGDCLVHPKEPPVKQGNPGALNVDHGLDMTQLCNAFAEDFTQEIRSDAIKIDEVQTKAESVLHSDDGAEHLEFPANHKEASALAEESINLTSRSRMENTLSESLKHENGYPA'..b'IDNDKELLSHAKEESRKSNIKHSSKLINNENCGKTEHTSEHLVCQSNVSLPFIKTLGKNATILVSGNEKSQNQELQLDCLKCESTDVKHTPQKETINDNSKCNESSLSELSMKGFQTASGRNIMMSESSIQKARNIFAEEHEDSFTLRCNIQNTIQIPQPVNEPTQFPYVNLGPKPTTTSGWQEKNILRRSTEKGFMPGFCTAGGKKVSVSDNSLAKAHKLFQEECTFSKEGKLDEVKQNKLMNSEPLSLLTCESVLKQSDGFIEDISTSRNALEIRPELYPEGMCSNRASNGSGNNSEFTAGEGISININQSSLLTTGNVLKNLPSESSGHDVYSVTEHLSTVVKVKRYNDSGHFVNQNLAECNDNHVLSTQKNTANISNRNEDCTSLAPLSFSTASGKSVTVSHDSLQKARLMLSEAANDVTVDTSKQEAAYITPAIRKTEAEKEQNTVDDSDRVNANTFSFSTASGKKVNISGNSLKQVRAVCLSSDPKETSAALFNVEKSVFNEDVKDVSLLQPNVTMPKAVSFSTASGKTVQLSDESLKKARVIFSEIDTCPLMQQQTNESTVEEIVIGGGMTKSKQMPLTTEKVETTRKNNGTFGFNTASGKQVSVSESALQKVKDIFQEFDDPDNYEQNKSLVRLPVSSKIKESTPGTKRLVQTAGSSYKNDNLQCKAGNLRTFQDKQAGKKSLTYSEAAISPIESSVPIYEMQVMLKHTNNQACKYQPRVEVPLQDQRWQNILEIELPATCAPAFRETHNILFFGDLQHSTHFDICSLYSGKNPAVKHQLASHSKMQTLVISGRDSSGTLTLQFTLRIVILHTVNNQYSLNKQLFTFSSALRQVTCIPTQAHLHSKVKIFHQSLPIKSPDVASDSTSKSYSPTAAKETINCSSASKIPAKKFVPPFKKTVATLADNQSNSVQNGSSDGLIESIVYPKEDKVETICSSKDQFDDSDILQMTSNLRCSKDLQEMRIRKKLRQKIKPHPGSLYRLKMSHVKRISLQSAVAERCPTLYSREQLYRYGIVKNHIGVSSENALSFQFHCSNYFTKELLLSGNGVQLADGGWLIPTEQGNAGKEEIYRAFCDTPGVDPKLISAEWVHNHYRWIVWKLAAMEVRFPKTFACRCLTPERVLLQLKYRYDVEIDKSQRSAIKKIMERDDSPAKTLVLCIAKIISQGTRLPNACSNKTEPADSKESSAVIEVTDSWYGIKVLLDPCLTALLHKGRLFIGQKLIVHGAELIGSDDACSPLEAPESLMLKIAANSTRPVRWHTKLGYFKDPRPFCLHLSSLLSEGGVVGCVDVVIQRIYPMQWMEKMANGLYVFRNDRAEEREAEKHSANQQKKLEMLFSKIQAEFEQREVTCNRRKGLRRRSLNAQQMQTLQDGAEIYEAIQN
ESDPGYLESYLSAEQLKALNHHRQLLNDKKQALIQAEFRKAIECSEQDANGCTRRDVTPVWKLRIADYRNYETDAAYILNIWRPLPDVLSLLKEGCRYKMYHLAASTSKGKSLAADLQLTATKKTRFQQLQLSESILEQIYSPREVTDFSRFQEPLFSAPYAEVDLVGLIISIYKKTGAAPVVYISDESHNIVALKFWTDLGQLGLEEITKPRTYISASNLRWRSDCIEGIPTLYVGDLANISSNPKESHLQRAIQKLKLSVQNVQDFWNSSQTALMKTLQINSTDTTECSKNPTTPTWKSDVSARSGYLTPLHHSGKRLLNSVHTSDPQTENPGCSKEIQLKTCKKRKALDFLNRIPSPPPVTPVRPFVSPSLQKAFRPPRSCSVQKLGPETKGNTENVQGTTPECTKDLAKLEGEFVADEELAMINTQALLLGLEEEKKKTEQKTSRTAGKMTAHESPIENASPVPAQEQQTEEALNIPVGNSEKSYLCLRKRKRK</mol_seq>\n-            </sequence>\n-            <property datatype="xsd:string" ref="Compara:genome_db_name" applies_to="clade">xenopus_tropicalis</property>\n-          </clade>\n+        </clade>\n+        <clade branch_length="0.314542">\n+          <name>ENSLACG00000007788</name>\n+          <taxonomy>\n+            <id>7897</id>\n+            <scientific_name>Latimeria chalumnae</scientific_name>\n+            <common_name>Coelacanth</common_name>\n+          </taxonomy>\n+          <sequence>\n+            <accession source="Ensembl">ENSLACP00000008815</accession>\n+            <location>JH127744.1:299190-332700</location>\n+            <mol_seq 
is_aligned="0">MEWEAVESVKALMRDDELTDAGLDASKDSLNRACRRQSGGNFRARKRMRLEQVSADEPPVKRQLLAEFDRTVENGHKSLQKPLICTPNGTLKDRRKFMYSVPLKPVVCGPWSNNSKTGQQVTKPSITLPGRGVETFQPKNHIAPSPVYDPPSNRRGPVFAPPFHGATFRGLQKPSASHTSSKTAKTFVPPFKMKASASHTVHFSSKVINTCEKILENLVYLKPSLASCNIFQSLEEMTANLQCARDLQEMRLRKKQRQNIRPQPGSLYLAKTSGVARVSLKAATGNQCPSSYSTEQLYVHGVGKSTLKVRSENAESFQFSCSDYFGKDVLLAGNGLKLADGGWLIPSDKGMVGKEEFYRALCDTPGVAPKLISESWVYNHYRWIVWKLAAMEAAFPKEFGNRCLTPERVLLQLKYRYDIEVDKCRRSTVKKIMERDDTAAKTLVLCISKLISVEDRFKQTKNKNEKGAEEARKEAVAGVIETTDGWYGIKVLLDPPLTVLVQRGRLSVGCKIITHGAEIIGSQDACTPLEAPECLMLKISANSTRPACWSAKLGFHRDPRPFPLPLASLFNDGGLVGCVDVVVVRLYPIQWMEKKSDGIFVFRNDRAEEREAQRQVENQQRKMESLFAKIQTEFEQKYEAKSKRRGQKAQKFSKQEIQALQDGAELNEAIENSMDPGYFEACLREEQLKVLHGHRQMLNEKKQAEFQAEFKKALESAEQEGKSCCKRGVTPVWKLRIVDYRKPSAAEYILNIWRPLADLHSLLKEGNRYRIYQLLASQSKGRTTTADIQLTATKKTQYQQFQSFPELISELYSPRKAVKFNMLMDPTFRPAYAEVDLVGYTISIEGKPGVAPVVYLSDESHNFVAIKVWTALNQLAVEDIVKPFSLIAASNLQWRSDSRSIIPMLYAGDLSIFSSNPKEGHLQEAFNQRRTAIQENISGTYLPPEKKNLHQESYKSCQYNTLNVLMNGNIHTQSPVLSRVHMGTSCAFLFLLPSPYPESKHTSPLITMKAGVKSMTFPGSAKLMPQASENQELDTPKNRKKKAALDYLCRIPSPPALTPIRSFVSSSLQKAFHPPRSCVKLQSGENPVVPTVGNNAVLGIQSKKDEGPAAFNEEDSVADEELAMINTQAFLVGLRRDKRPSLLDKTASLKGHVPSERFLEEKLLSVLKEQASSNSERNATSLENKSCDKSRTCVKPCEHSNDSIAEETSEIIPGCHGGESAVENQSKNSSLCHKKLQQKKRRKYY</mol_seq>\n+          </sequence>\n+          <property datatype="xsd:string" ref="Compara:genome_db_name" applies_to="clade">latimeria_chalumnae</property>\n         </clade>\n       </clade>\n     </clade>\n'