Mercurial > repos > earlhaminst > ensembl_get_sequences
changeset 7:c79ce2342f1e draft default tip
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/Ensembl-REST commit 8f8808de862973aedbf87abd4dfa9d2dc7219322
author | earlhaminst |
---|---|
date | Mon, 17 Feb 2025 14:49:24 +0000 |
parents | 7af66c2b3831 |
children | |
files | get_feature_info.py get_genetree.py get_sequences.py get_sequences.xml macros.xml |
diffstat | 5 files changed, 163 insertions(+), 83 deletions(-) [+] |
line wrap: on
line diff
--- a/get_feature_info.py Tue Oct 20 15:07:23 2020 +0000 +++ b/get_feature_info.py Mon Feb 17 14:49:24 2025 +0000 @@ -1,37 +1,45 @@ # A simple tool to connect to the Ensembl server and retrieve feature # information using the Ensembl REST API. -from __future__ import print_function - import json import optparse from itertools import islice +from urllib.parse import urljoin import requests -from six.moves.urllib.parse import urljoin parser = optparse.OptionParser() -parser.add_option('-i', '--input', help='List of Ensembl IDs') -parser.add_option('-e', '--expand', type='choice', choices=['0', '1'], - default='0', - help='Expands the search to include any connected features. e.g. If the object is a gene, its transcripts, translations and exons will be returned as well.') +parser.add_option("-i", "--input", help="List of Ensembl IDs") +parser.add_option( + "-e", + "--expand", + type="choice", + choices=["0", "1"], + default="0", + help="Expands the search to include any connected features. e.g. 
If the object is a gene, its transcripts, translations and exons will be returned as well.", +) -parser.add_option('-f', '--format', type='choice', - choices=['full', 'condensed'], default='full', - help='Specify the formats to emit from this endpoint') +parser.add_option( + "-f", + "--format", + type="choice", + choices=["full", "condensed"], + default="full", + help="Specify the formats to emit from this endpoint", +) options, args = parser.parse_args() if options.input is None: - raise Exception('-i option must be specified') + raise Exception("-i option must be specified") -server = 'https://rest.ensembl.org' -ext = 'lookup/id' +server = "https://rest.ensembl.org" +ext = "lookup/id" -headers = {'Content-Type': 'application/json', 'Accept': 'application/json'} -params = dict((k, getattr(options, k)) for k in ['format', 'expand']) +headers = {"Content-Type": "application/json", "Accept": "application/json"} +params = {k: getattr(options, k) for k in ("format", "expand")} first = True -print('{') +print("{") with open(options.input) as f: while True: @@ -40,9 +48,14 @@ break if not first: print(",") - data = {'ids': ids} - r = requests.post(urljoin(server, ext), params=params, headers=headers, - data=json.dumps(data), allow_redirects=False) + data = {"ids": ids} + r = requests.post( + urljoin(server, ext), + params=params, + headers=headers, + data=json.dumps(data), + allow_redirects=False, + ) if not r.ok: r.raise_for_status() @@ -51,4 +64,4 @@ first = False -print('}') +print("}")
--- a/get_genetree.py Tue Oct 20 15:07:23 2020 +0000 +++ b/get_genetree.py Mon Feb 17 14:49:24 2025 +0000 @@ -1,54 +1,93 @@ # A simple tool to connect to the Ensembl server and retrieve genetree using # the Ensembl REST API. -from __future__ import print_function - import optparse +from urllib.parse import urljoin import requests -from six.moves.urllib.parse import urljoin parser = optparse.OptionParser() -parser.add_option('--id_type', type='choice', default='gene_id', - choices=['gene_id', 'gene_tree_id'], help='Input type') -parser.add_option('-i', '--input', help='Ensembl ID') -parser.add_option('--format', type='choice', - choices=['json', 'orthoxml', 'phyloxml', 'nh'], - default='json', help='Output format') -parser.add_option('-s', '--sequence', type='choice', - choices=['protein', 'cdna', 'none'], default='protein', - help='The type of sequence to bring back. Setting it to none results in no sequence being returned') +parser.add_option( + "--id_type", + type="choice", + default="gene_id", + choices=["gene_id", "gene_tree_id"], + help="Input type", +) +parser.add_option("--species", help="Species name/alias") +parser.add_option("-i", "--input", help="Ensembl ID") +parser.add_option( + "--format", + type="choice", + choices=["json", "orthoxml", "phyloxml", "nh"], + default="json", + help="Output format", +) +parser.add_option( + "-s", + "--sequence", + type="choice", + choices=["protein", "cdna", "none"], + default="protein", + help="The type of sequence to bring back. Setting it to none results in no sequence being returned", +) -parser.add_option('-a', '--aligned', type='choice', choices=['0', '1'], - default='0', help='Return the aligned string if true. 
Otherwise, return the original sequence (no insertions)') -parser.add_option('-c', '--cigar_line', type='choice', choices=['0', '1'], - default='0', - help='Return the aligned sequence encoded in CIGAR format') -parser.add_option('--nh_format', type='choice', - choices=['full', 'display_label_composite', 'simple', 'species', 'species_short_name', 'ncbi_taxon', 'ncbi_name', 'njtree', 'phylip'], - default='simple', - help='The format of a NH (New Hampshire) request') +parser.add_option( + "-a", + "--aligned", + type="choice", + choices=["0", "1"], + default="0", + help="Return the aligned string if true. Otherwise, return the original sequence (no insertions)", +) +parser.add_option( + "-c", + "--cigar_line", + type="choice", + choices=["0", "1"], + default="0", + help="Return the aligned sequence encoded in CIGAR format", +) +parser.add_option( + "--nh_format", + type="choice", + choices=[ + "full", + "display_label_composite", + "simple", + "species", + "species_short_name", + "ncbi_taxon", + "ncbi_name", + "njtree", + "phylip", + ], + default="simple", + help="The format of a NH (New Hampshire) request", +) options, args = parser.parse_args() if options.input is None: - raise Exception('-i option must be specified') + raise Exception("-i option must be specified") -server = 'https://rest.ensembl.org' +server = "https://rest.ensembl.org" -if options.id_type == 'gene_id': - ext = 'genetree/member/id' -elif options.id_type == 'gene_tree_id': - ext = 'genetree/id' +if options.id_type == "gene_id": + ext = f"genetree/member/id/{options.species}/{options.input}" +elif options.id_type == "gene_tree_id": + ext = f"genetree/id/{options.input}" -if options.format == 'json': - content_type = 'application/json' -elif options.format == 'orthoxml': - content_type = 'text/x-orthoxml+xml' -elif options.format == 'phyloxml': - content_type = 'text/x-phyloxml+xml' -elif options.format == 'nh': - content_type = 'text/x-nh' -headers = {'Content-Type': content_type} -params = dict((k, 
getattr(options, k)) for k in ['sequence', 'aligned', 'cigar_line', 'nh_format']) -r = requests.get(urljoin(server, '/'.join([ext, options.input])), params=params, headers=headers) +if options.format == "json": + content_type = "application/json" +elif options.format == "orthoxml": + content_type = "text/x-orthoxml+xml" +elif options.format == "phyloxml": + content_type = "text/x-phyloxml+xml" +elif options.format == "nh": + content_type = "text/x-nh" +headers = {"Content-Type": content_type} +params = { + k: getattr(options, k) for k in ("sequence", "aligned", "cigar_line", "nh_format") +} +r = requests.get(urljoin(server, ext), params=params, headers=headers) if not r.ok: r.raise_for_status()
--- a/get_sequences.py Tue Oct 20 15:07:23 2020 +0000 +++ b/get_sequences.py Mon Feb 17 14:49:24 2025 +0000 @@ -1,42 +1,60 @@ # A simple tool to connect to the Ensembl server and retrieve sequences using # the Ensembl REST API. -from __future__ import print_function - import json import optparse from itertools import islice +from urllib.parse import urljoin import requests -from six.moves.urllib.parse import urljoin parser = optparse.OptionParser() -parser.add_option('-i', '--input', help='List of Ensembl IDs') +parser.add_option("-i", "--input", help="List of Ensembl IDs") -parser.add_option('-t', '--type', type='choice', - choices=['genomic', 'cds', 'cdna', 'protein'], - default='genomic', help='Type of sequence') -parser.add_option('--expand_3prime', type='int', default=0, - help='Expand the sequence downstream of the sequence by this many basepairs. Only available when using genomic sequence type') -parser.add_option('--expand_5prime', type='int', default=0, - help='Expand the sequence upstream of the sequence by this many basepairs. Only available when using genomic sequence type') +parser.add_option( + "-t", + "--type", + type="choice", + choices=["genomic", "cds", "cdna", "protein"], + default="genomic", + help="Type of sequence", +) +parser.add_option( + "--expand_3prime", + type="int", + default=0, + help="Expand the sequence downstream of the sequence by this many basepairs. Only available when using genomic sequence type", +) +parser.add_option( + "--expand_5prime", + type="int", + default=0, + help="Expand the sequence upstream of the sequence by this many basepairs. 
Only available when using genomic sequence type", +) options, args = parser.parse_args() if options.input is None: - raise Exception('-i option must be specified') + raise Exception("-i option must be specified") + +server = "https://rest.ensembl.org" +ext = "sequence/id" -server = 'https://rest.ensembl.org' -ext = 'sequence/id' - -headers = {'Content-Type': 'text/x-fasta', 'Accept': 'text/x-fasta'} -params = dict((k, getattr(options, k)) for k in ['type', 'expand_3prime', 'expand_5prime']) +headers = {"Content-Type": "text/x-fasta", "Accept": "text/x-fasta"} +params = { + k: getattr(options, k) for k in ("type", "expand_3prime", "expand_5prime") +} with open(options.input) as f: # Need to split the file in chunks of 50 lines because of the limit imposed by Ensembl while True: ids = [line.strip() for line in islice(f, 50)] if not ids: break - data = {'ids': ids} - r = requests.post(urljoin(server, ext), params=params, headers=headers, - data=json.dumps(data), allow_redirects=False) + data = {"ids": ids} + r = requests.post( + urljoin(server, ext), + params=params, + headers=headers, + data=json.dumps(data), + allow_redirects=False, + ) if not r.ok: r.raise_for_status()
```diff
--- a/get_sequences.xml Tue Oct 20 15:07:23 2020 +0000
+++ b/get_sequences.xml Mon Feb 17 14:49:24 2025 +0000
@@ -1,8 +1,11 @@
-<tool id="get_sequences" name="Get sequences by Ensembl ID" version="0.1.2">
+<tool id="get_sequences" name="Get sequences by Ensembl ID" version="1.0.0">
     <description>using REST API</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
     <requirements>
-        <requirement type="package" version="2.12.4">requests</requirement>
-        <requirement type="package" version="1.10.0">six</requirement>
+        <requirement type="package" version="3.13">python</requirement>
+        <requirement type="package" version="2.32.3">requests</requirement>
     </requirements>
     <command detect_errors="exit_code">
 <![CDATA[
@@ -57,6 +60,5 @@
 .. _"POST sequence/id": https://rest.ensembl.org/documentation/info/sequence_id_post
 ]]>
     </help>
-    <citations>
-    </citations>
+    <expand macro="citations" />
 </tool>
```