comparison get_sequences.py @ 8:935de83b470b draft default tip

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/Ensembl-REST commit 8f8808de862973aedbf87abd4dfa9d2dc7219322
author earlhaminst
date Mon, 17 Feb 2025 14:49:15 +0000
parents 0618e3bd5138
children
comparison
equal deleted inserted replaced
7:515e7181d5e9 8:935de83b470b
1 # A simple tool to connect to the Ensembl server and retrieve sequences using 1 # A simple tool to connect to the Ensembl server and retrieve sequences using
2 # the Ensembl REST API. 2 # the Ensembl REST API.
3 from __future__ import print_function
4
5 import json 3 import json
6 import optparse 4 import optparse
7 from itertools import islice 5 from itertools import islice
6 from urllib.parse import urljoin
8 7
9 import requests 8 import requests
10 from six.moves.urllib.parse import urljoin
11 9
12 parser = optparse.OptionParser() 10 parser = optparse.OptionParser()
13 parser.add_option('-i', '--input', help='List of Ensembl IDs') 11 parser.add_option("-i", "--input", help="List of Ensembl IDs")
14 12
15 parser.add_option('-t', '--type', type='choice', 13 parser.add_option(
16 choices=['genomic', 'cds', 'cdna', 'protein'], 14 "-t",
17 default='genomic', help='Type of sequence') 15 "--type",
18 parser.add_option('--expand_3prime', type='int', default=0, 16 type="choice",
19 help='Expand the sequence downstream of the sequence by this many basepairs. Only available when using genomic sequence type') 17 choices=["genomic", "cds", "cdna", "protein"],
20 parser.add_option('--expand_5prime', type='int', default=0, 18 default="genomic",
21 help='Expand the sequence upstream of the sequence by this many basepairs. Only available when using genomic sequence type') 19 help="Type of sequence",
20 )
21 parser.add_option(
22 "--expand_3prime",
23 type="int",
24 default=0,
25 help="Expand the sequence downstream of the sequence by this many basepairs. Only available when using genomic sequence type",
26 )
27 parser.add_option(
28 "--expand_5prime",
29 type="int",
30 default=0,
31 help="Expand the sequence upstream of the sequence by this many basepairs. Only available when using genomic sequence type",
32 )
22 options, args = parser.parse_args() 33 options, args = parser.parse_args()
23 if options.input is None: 34 if options.input is None:
24 raise Exception('-i option must be specified') 35 raise Exception("-i option must be specified")
25 36
26 server = 'https://rest.ensembl.org' 37 server = "https://rest.ensembl.org"
27 ext = 'sequence/id' 38 ext = "sequence/id"
28 39
29 headers = {'Content-Type': 'text/x-fasta', 'Accept': 'text/x-fasta'} 40 headers = {"Content-Type": "text/x-fasta", "Accept": "text/x-fasta"}
30 params = dict((k, getattr(options, k)) for k in ['type', 'expand_3prime', 'expand_5prime']) 41 params = {
42 k: getattr(options, k) for k in ("type", "expand_3prime", "expand_5prime")
43 }
31 with open(options.input) as f: 44 with open(options.input) as f:
32 # Need to split the file in chunks of 50 lines because of the limit imposed by Ensembl 45 # Need to split the file in chunks of 50 lines because of the limit imposed by Ensembl
33 while True: 46 while True:
34 ids = [line.strip() for line in islice(f, 50)] 47 ids = [line.strip() for line in islice(f, 50)]
35 if not ids: 48 if not ids:
36 break 49 break
37 data = {'ids': ids} 50 data = {"ids": ids}
38 r = requests.post(urljoin(server, ext), params=params, headers=headers, 51 r = requests.post(
39 data=json.dumps(data), allow_redirects=False) 52 urljoin(server, ext),
53 params=params,
54 headers=headers,
55 data=json.dumps(data),
56 allow_redirects=False,
57 )
40 58
41 if not r.ok: 59 if not r.ok:
42 r.raise_for_status() 60 r.raise_for_status()
43 61
44 print(r.text) 62 print(r.text)