Mercurial > repos > earlhaminst > ensembl_get_sequences

--- a/get_feature_info.py	Tue Oct 20 15:07:23 2020 +0000
+++ b/get_feature_info.py	Mon Feb 17 14:49:24 2025 +0000
@@ -1,37 +1,45 @@
 # A simple tool to connect to the Ensembl server and retrieve feature
 # information using the Ensembl REST API.
-from __future__ import print_function
-
 import json
 import optparse
 from itertools import islice
+from urllib.parse import urljoin

 import requests
-from six.moves.urllib.parse import urljoin

 parser = optparse.OptionParser()
-parser.add_option('-i', '--input', help='List of Ensembl IDs')
-parser.add_option('-e', '--expand', type='choice', choices=['0', '1'],
-                  default='0',
-                  help='Expands the search to include any connected features. e.g. If the object is a gene, its transcripts, translations and exons will be returned as well.')
+parser.add_option("-i", "--input", help="List of Ensembl IDs")
+parser.add_option(
+    "-e",
+    "--expand",
+    type="choice",
+    choices=["0", "1"],
+    default="0",
+    help="Expands the search to include any connected features. e.g. If the object is a gene, its transcripts, translations and exons will be returned as well.",
+)

-parser.add_option('-f', '--format', type='choice',
-                  choices=['full', 'condensed'], default='full',
-                  help='Specify the formats to emit from this endpoint')
+parser.add_option(
+    "-f",
+    "--format",
+    type="choice",
+    choices=["full", "condensed"],
+    default="full",
+    help="Specify the formats to emit from this endpoint",
+)
 options, args = parser.parse_args()
 if options.input is None:
-    raise Exception('-i option must be specified')
+    raise Exception("-i option must be specified")


-server = 'https://rest.ensembl.org'
-ext = 'lookup/id'
+server = "https://rest.ensembl.org"
+ext = "lookup/id"

-headers = {'Content-Type': 'application/json', 'Accept': 'application/json'}
-params = dict((k, getattr(options, k)) for k in ['format', 'expand'])
+headers = {"Content-Type": "application/json", "Accept": "application/json"}
+params = {k: getattr(options, k) for k in ("format", "expand")}

 first = True

-print('{')
+print("{")

 with open(options.input) as f:
     while True:
@@ -40,9 +48,14 @@
             break
         if not first:
             print(",")
-        data = {'ids': ids}
-        r = requests.post(urljoin(server, ext), params=params, headers=headers,
-                          data=json.dumps(data), allow_redirects=False)
+        data = {"ids": ids}
+        r = requests.post(
+            urljoin(server, ext),
+            params=params,
+            headers=headers,
+            data=json.dumps(data),
+            allow_redirects=False,
+        )

         if not r.ok:
             r.raise_for_status()
@@ -51,4 +64,4 @@

         first = False

-print('}')
+print("}")
--- a/get_genetree.py	Tue Oct 20 15:07:23 2020 +0000
+++ b/get_genetree.py	Mon Feb 17 14:49:24 2025 +0000
@@ -1,54 +1,97 @@
 # A simple tool to connect to the Ensembl server and retrieve genetree using
 # the Ensembl REST API.
-from __future__ import print_function
-
 import optparse
+from urllib.parse import urljoin

 import requests
-from six.moves.urllib.parse import urljoin

 parser = optparse.OptionParser()
-parser.add_option('--id_type', type='choice', default='gene_id',
-                  choices=['gene_id', 'gene_tree_id'], help='Input type')
-parser.add_option('-i', '--input', help='Ensembl ID')
-parser.add_option('--format', type='choice',
-                  choices=['json', 'orthoxml', 'phyloxml', 'nh'],
-                  default='json', help='Output format')
-parser.add_option('-s', '--sequence', type='choice',
-                  choices=['protein', 'cdna', 'none'], default='protein',
-                  help='The type of sequence to bring back. Setting it to none results in no sequence being returned')
+parser.add_option(
+    "--id_type",
+    type="choice",
+    default="gene_id",
+    choices=["gene_id", "gene_tree_id"],
+    help="Input type",
+)
+parser.add_option("--species", help="Species name/alias")
+parser.add_option("-i", "--input", help="Ensembl ID")
+parser.add_option(
+    "--format",
+    type="choice",
+    choices=["json", "orthoxml", "phyloxml", "nh"],
+    default="json",
+    help="Output format",
+)
+parser.add_option(
+    "-s",
+    "--sequence",
+    type="choice",
+    choices=["protein", "cdna", "none"],
+    default="protein",
+    help="The type of sequence to bring back. Setting it to none results in no sequence being returned",
+)

-parser.add_option('-a', '--aligned', type='choice', choices=['0', '1'],
-                  default='0', help='Return the aligned string if true. Otherwise, return the original sequence (no insertions)')
-parser.add_option('-c', '--cigar_line', type='choice', choices=['0', '1'],
-                  default='0',
-                  help='Return the aligned sequence encoded in CIGAR format')
-parser.add_option('--nh_format', type='choice',
-                  choices=['full', 'display_label_composite', 'simple', 'species', 'species_short_name', 'ncbi_taxon', 'ncbi_name', 'njtree', 'phylip'],
-                  default='simple',
-                  help='The format of a NH (New Hampshire) request')
+parser.add_option(
+    "-a",
+    "--aligned",
+    type="choice",
+    choices=["0", "1"],
+    default="0",
+    help="Return the aligned string if true. Otherwise, return the original sequence (no insertions)",
+)
+parser.add_option(
+    "-c",
+    "--cigar_line",
+    type="choice",
+    choices=["0", "1"],
+    default="0",
+    help="Return the aligned sequence encoded in CIGAR format",
+)
+parser.add_option(
+    "--nh_format",
+    type="choice",
+    choices=[
+        "full",
+        "display_label_composite",
+        "simple",
+        "species",
+        "species_short_name",
+        "ncbi_taxon",
+        "ncbi_name",
+        "njtree",
+        "phylip",
+    ],
+    default="simple",
+    help="The format of a NH (New Hampshire) request",
+)
 options, args = parser.parse_args()
 if options.input is None:
-    raise Exception('-i option must be specified')
+    raise Exception("-i option must be specified")

-server = 'https://rest.ensembl.org'
+server = "https://rest.ensembl.org"

-if options.id_type == 'gene_id':
-    ext = 'genetree/member/id'
-elif options.id_type == 'gene_tree_id':
-    ext = 'genetree/id'
+if options.id_type == "gene_id":
+    # The member endpoint path embeds the species; without --species the URL
+    # would contain the literal text "None" and fail with an opaque HTTP error.
+    if options.species is None:
+        raise Exception("--species option must be specified with --id_type gene_id")
+    ext = f"genetree/member/id/{options.species}/{options.input}"
+elif options.id_type == "gene_tree_id":
+    ext = f"genetree/id/{options.input}"

-if options.format == 'json':
-    content_type = 'application/json'
-elif options.format == 'orthoxml':
-    content_type = 'text/x-orthoxml+xml'
-elif options.format == 'phyloxml':
-    content_type = 'text/x-phyloxml+xml'
-elif options.format == 'nh':
-    content_type = 'text/x-nh'
-headers = {'Content-Type': content_type}
-params = dict((k, getattr(options, k)) for k in ['sequence', 'aligned', 'cigar_line', 'nh_format'])
-r = requests.get(urljoin(server, '/'.join([ext, options.input])), params=params, headers=headers)
+if options.format == "json":
+    content_type = "application/json"
+elif options.format == "orthoxml":
+    content_type = "text/x-orthoxml+xml"
+elif options.format == "phyloxml":
+    content_type = "text/x-phyloxml+xml"
+elif options.format == "nh":
+    content_type = "text/x-nh"
+headers = {"Content-Type": content_type}
+params = {
+    k: getattr(options, k) for k in ("sequence", "aligned", "cigar_line", "nh_format")
+}
+r = requests.get(urljoin(server, ext), params=params, headers=headers)

 if not r.ok:
     r.raise_for_status()
--- a/get_sequences.py	Tue Oct 20 15:07:23 2020 +0000
+++ b/get_sequences.py	Mon Feb 17 14:49:24 2025 +0000
@@ -1,42 +1,60 @@
 # A simple tool to connect to the Ensembl server and retrieve sequences using
 # the Ensembl REST API.
-from __future__ import print_function
-
 import json
 import optparse
 from itertools import islice
+from urllib.parse import urljoin

 import requests
-from six.moves.urllib.parse import urljoin

 parser = optparse.OptionParser()
-parser.add_option('-i', '--input', help='List of Ensembl IDs')
+parser.add_option("-i", "--input", help="List of Ensembl IDs")

-parser.add_option('-t', '--type', type='choice',
-                  choices=['genomic', 'cds', 'cdna', 'protein'],
-                  default='genomic', help='Type of sequence')
-parser.add_option('--expand_3prime', type='int', default=0,
-                  help='Expand the sequence downstream of the sequence by this many basepairs. Only available when using genomic sequence type')
-parser.add_option('--expand_5prime', type='int', default=0,
-                  help='Expand the sequence upstream of the sequence by this many basepairs. Only available when using genomic sequence type')
+parser.add_option(
+    "-t",
+    "--type",
+    type="choice",
+    choices=["genomic", "cds", "cdna", "protein"],
+    default="genomic",
+    help="Type of sequence",
+)
+parser.add_option(
+    "--expand_3prime",
+    type="int",
+    default=0,
+    help="Expand the sequence downstream of the sequence by this many basepairs. Only available when using genomic sequence type",
+)
+parser.add_option(
+    "--expand_5prime",
+    type="int",
+    default=0,
+    help="Expand the sequence upstream of the sequence by this many basepairs. Only available when using genomic sequence type",
+)
 options, args = parser.parse_args()
 if options.input is None:
-    raise Exception('-i option must be specified')
+    raise Exception("-i option must be specified")
+
+server = "https://rest.ensembl.org"
+ext = "sequence/id"

-server = 'https://rest.ensembl.org'
-ext = 'sequence/id'
-
-headers = {'Content-Type': 'text/x-fasta', 'Accept': 'text/x-fasta'}
-params = dict((k, getattr(options, k)) for k in ['type', 'expand_3prime', 'expand_5prime'])
+headers = {"Content-Type": "text/x-fasta", "Accept": "text/x-fasta"}
+params = {
+    k: getattr(options, k) for k in ("type", "expand_3prime", "expand_5prime")
+}
 with open(options.input) as f:
     # Need to split the file in chunks of 50 lines because of the limit imposed by Ensembl
     while True:
         ids = [line.strip() for line in islice(f, 50)]
         if not ids:
             break
-        data = {'ids': ids}
-        r = requests.post(urljoin(server, ext), params=params, headers=headers,
-                          data=json.dumps(data), allow_redirects=False)
+        data = {"ids": ids}
+        r = requests.post(
+            urljoin(server, ext),
+            params=params,
+            headers=headers,
+            data=json.dumps(data),
+            allow_redirects=False,
+        )

         if not r.ok:
             r.raise_for_status()
--- a/get_sequences.xml	Tue Oct 20 15:07:23 2020 +0000
+++ b/get_sequences.xml	Mon Feb 17 14:49:24 2025 +0000
@@ -1,8 +1,11 @@
-<tool id="get_sequences" name="Get sequences by Ensembl ID" version="0.1.2">
+<tool id="get_sequences" name="Get sequences by Ensembl ID" version="1.0.0">
     <description>using REST API</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
     <requirements>
-        <requirement type="package" version="2.12.4">requests</requirement>
-        <requirement type="package" version="1.10.0">six</requirement>
+        <requirement type="package" version="3.13">python</requirement>
+        <requirement type="package" version="2.32.3">requests</requirement>
     </requirements>
     <command detect_errors="exit_code">
 <![CDATA[
@@ -57,6 +60,5 @@
 .. _"POST sequence/id": https://rest.ensembl.org/documentation/info/sequence_id_post
 ]]>
     </help>
-    <citations>
-    </citations>
+    <expand macro="citations" />
 </tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Mon Feb 17 14:49:24 2025 +0000
@@ -0,0 +1,8 @@
+<?xml version="1.0"?>
+<macros>
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1093/nar/gkae1071</citation>
+        </citations>
+    </xml>
+</macros>