comparison ensembl_rest.py @ 0:da1b538b87e5 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/proteogenomics/retrieve_ensembl_bed commit 88cf1e923a8c9e5bc6953ad412d15a7c70f054d1
author galaxyp
date Mon, 22 Jan 2018 13:13:47 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:da1b538b87e5
1 #!/usr/bin/env python
2 """
3 #
4 #------------------------------------------------------------------------------
5 # University of Minnesota
6 # Copyright 2017, Regents of the University of Minnesota
7 #------------------------------------------------------------------------------
8 # Author:
9 #
10 # James E Johnson
11 #
12 #------------------------------------------------------------------------------
13 """
14
15 from __future__ import print_function
16 from __future__ import unicode_literals
17
18 import sys
19
20 from time import sleep
21
22 import requests
23
24
# Base URL of the Ensembl REST service; ensembl_rest() appends the request
# path to this value.
server = "https://rest.ensembl.org"
# Module-level request path. The functions in this file assign their own
# local ``ext`` before calling ensembl_rest().
ext = "/info/assembly/homo_sapiens?"
# Step size used by get_transcripts_bed() when it splits a sequence into
# several overlap requests.
max_region = 4000000
# When true, ensembl_rest() echoes every request path to stderr.
debug = False
29
30
def ensembl_rest(ext, headers):
    """Send a GET request to the Ensembl REST server and return the response.

    ``ext`` is the request path (including any query string) that is
    appended to the module-level ``server`` URL; ``headers`` is the dict of
    HTTP headers handed to ``requests.get``.

    If the server answers with status 429, the response headers are written
    to stderr and, when a ``Retry-After`` header is present, the call sleeps
    for that many seconds and repeats the request once.

    Returns the ``requests`` response object.  A final response that is not
    OK is turned into an exception by ``raise_for_status()``.
    """
    url = server + ext
    if debug:
        print("%s" % ext, file=sys.stderr)
    r = requests.get(url, headers=headers)
    if r.status_code == 429:
        print("response headers: %s\n" % r.headers, file=sys.stderr)
        if 'Retry-After' in r.headers:
            # Header values arrive as strings, but sleep() only accepts a
            # number; without the conversion the retry path raised
            # TypeError instead of waiting.
            sleep(float(r.headers['Retry-After']))
            r = requests.get(url, headers=headers)
    if not r.ok:
        r.raise_for_status()
    return r
43
44
def get_species():
    """Query ``/info/species`` and return the species entries keyed by name.

    Every entry of the ``species`` list in the JSON response is stored in
    the returned dict under its ``name`` value.  As a side effect, one
    tab-separated line (name, common_name, display_name, strain, taxon_id)
    is printed to stdout per entry.
    """
    response = ensembl_rest("/info/species",
                            {"Content-Type": "application/json"})
    columns = ('name', 'common_name', 'display_name', 'strain', 'taxon_id')
    by_name = {}
    for entry in response.json()['species']:
        by_name[entry['name']] = entry
        row = tuple(entry[column] for column in columns)
        print("%s\t%s\t%s\t%s\t%s" % row, file=sys.stdout)
    return by_name
58
59
def get_biotypes(species):
    """Return the biotype names listed by ``/info/biotypes/<species>``.

    The result is a list holding the ``biotype`` value of every entry in
    the JSON response that has such a key, in response order.
    """
    response = ensembl_rest("/info/biotypes/%s?" % species,
                            {"Content-Type": "application/json"})
    return [item['biotype'] for item in response.json()
            if 'biotype' in item]
69
70
def get_toplevel(species):
    """Return top-level region lengths grouped by coordinate system.

    Queries ``/info/assembly/<species>`` and walks the ``top_level_region``
    list of the JSON response.  The result maps each ``coord_system`` value
    to a dict of region ``name`` -> ``length`` (converted to int).
    """
    response = ensembl_rest("/info/assembly/%s?" % species,
                            {"Content-Type": "application/json"})
    lengths_by_system = {}
    for region in response.json()['top_level_region']:
        # setdefault creates the inner dict the first time a coordinate
        # system is seen and returns the existing one afterwards.
        names = lengths_by_system.setdefault(region['coord_system'], {})
        names[region['name']] = int(region['length'])
    return lengths_by_system
83
84
def get_transcripts_bed(species, refseq, start, length, strand='',
                        params=None):
    """Collect BED lines for transcripts overlapping ``refseq``.

    The span from ``start`` to ``length`` is cut into pieces of at most
    ``max_region`` positions, and one ``/overlap/region`` request with
    ``feature=transcript`` is sent per piece using the ``text/x-bed``
    content type.  ``strand`` is inserted verbatim after the end
    coordinate and ``params`` (if given) is appended to the query string.

    Returns the response lines of all pieces concatenated into one list;
    the list is empty when no request is made or nothing is returned.
    """
    query = params or ''
    bed_header = {"Content-Type": "text/x-bed"}

    # Piece boundaries: start, start + max_region, ..., closed by length.
    boundaries = list(range(start, length, max_region))
    if len(boundaries) == 0 or boundaries[-1] < length:
        boundaries.append(length)

    lines = []
    chunk_start = start
    for chunk_end in boundaries[1:]:
        path = ("/overlap/region/%s/%s:%d-%d%s?feature=transcript;%s"
                % (species, refseq, chunk_start, chunk_end, strand, query))
        # The next piece begins right after the end of this one.
        chunk_start = chunk_end + 1
        response = ensembl_rest(path, bed_header)
        if response.text:
            lines.extend(response.text.splitlines())
    return lines
101
102
def get_seq(id, seqtype, params=None):
    """Return the text body of a ``/sequence/id/<id>`` request.

    ``seqtype`` is sent as the ``type`` query parameter, and ``params``
    (if given) is appended to the query string after a ``;``.  The request
    uses the ``text/plain`` content type.
    """
    query = params or ''
    path = "/sequence/id/%s?type=%s;%s" % (id, seqtype, query)
    return ensembl_rest(path, {"Content-Type": "text/plain"}).text
109
110
def get_cdna(id, params=None):
    """Return the result of get_seq() for ``id`` with type ``'cdna'``."""
    sequence = get_seq(id, 'cdna', params=params)
    return sequence
113
114
def get_cds(id, params=None):
    """Return the result of get_seq() for ``id`` with type ``'cds'``."""
    sequence = get_seq(id, 'cds', params=params)
    return sequence
117
118
def get_genomic(id, params=None):
    """Return the result of get_seq() for ``id`` with type ``'genomic'``."""
    sequence = get_seq(id, 'genomic', params=params)
    return sequence
121
122
def get_transcript_haplotypes(species, transcript):
    """Return the decoded JSON of a transcript haplotypes request.

    Requests ``/transcript_haplotypes/<species>/<transcript>`` with
    ``aligned_sequences=1`` and the ``application/json`` content type.
    """
    path = ("/transcript_haplotypes/%s/%s?aligned_sequences=1"
            % (species, transcript))
    response = ensembl_rest(path, {"Content-Type": "application/json"})
    return response.json()