Mercurial > repos > iuc > ebi_search_rest_results
comparison ebeye_urllib.py @ 0:bb7989bd88ba draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ebi_tools commit 7a9c88c1c80b80aaa63e55e9d9125b6a4dd695ac
author | iuc |
---|---|
date | Thu, 01 Dec 2016 15:28:20 -0500 |
parents | |
children | b6029f2c71cb |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:bb7989bd88ba |
---|---|
1 #!/usr/bin/env python | |
2 # ====================================================================== | |
3 # Script derived from the EB-eye (REST) Python client available at | |
4 # http://www.ebi.ac.uk/Tools/webservices/services/eb-eye_rest | |
5 # and distributed under the Apache License | |
6 # ====================================================================== | |
7 # Load libraries | |
8 import platform | |
9 import os | |
10 import urllib | |
11 import re | |
12 from optparse import OptionParser | |
13 from gzip import GzipFile | |
14 from xmltramp2 import xmltramp | |
15 # python2 | |
16 from StringIO import StringIO | |
17 import urllib2 | |
18 # python3 | |
19 # import urllib.request as urllib2 | |
20 | |
21 | |
# Service base URL: every REST request URL in this script is built by
# appending a path and query string to this prefix.
baseUrl = 'http://www.ebi.ac.uk/ebisearch/ws/rest'

# Debug level: printDebugMessage() only prints messages whose level is
# less than or equal to this value.
debugLevel = 0
27 | |
28 | |
29 # Debug print | |
def printDebugMessage(functionName, message, level):
    """Print a tagged debug message if its level is enabled.

    The text ``[functionName] message`` is written to standard output only
    when ``level`` is less than or equal to the module-level ``debugLevel``;
    otherwise nothing happens.
    """
    if level > debugLevel:
        return
    text = '[' + functionName + '] ' + message
    print(text)
33 | |
34 | |
35 # User-agent for request. | |
def getUserAgent():
    """Build and return the User-Agent string sent with each request.

    The string has the form
    ``EBI-Sample-Client/<version> (<script>; Python <pyver>; <os>) <urllib>``
    where ``<version>`` is the number taken from the ``$Revision$`` keyword
    below, ``<script>`` is this file's base name and ``<urllib>`` identifies
    the urllib2 version in use.
    """
    printDebugMessage('getUserAgent', 'Begin', 11)
    clientRevision = '$Revision: 2468 $'
    # Drop the leading '$Revision: ' (11 characters) and the trailing ' $';
    # fall back to '0' when the keyword carries no number.
    clientVersion = clientRevision[11:-2] if len(clientRevision) > 11 else '0'
    agent_parts = (
        clientVersion,
        os.path.basename(__file__),
        platform.python_version(),
        platform.system(),
        'Python-urllib/%s' % urllib2.__version__,
    )
    user_agent = 'EBI-Sample-Client/%s (%s; Python %s; %s) %s' % agent_parts
    printDebugMessage('getUserAgent', 'user_agent: ' + user_agent, 12)
    printDebugMessage('getUserAgent', 'End', 11)
    return user_agent
51 | |
52 | |
53 # Wrapper for a REST (HTTP GET) request | |
def restRequest(url):
    """Perform an HTTP GET on ``url`` and return the response body.

    The URL is percent-quoted first (characters that are part of URL syntax
    are left untouched) and the request advertises gzip support. A response
    whose Content-Encoding is ``gzip`` is decompressed before it is
    returned; a missing or ``identity`` encoding is returned as read.

    Raises:
        urllib2.HTTPError: propagated unchanged when opening the URL fails
            with an HTTP error.
        Exception: when the response uses any other Content-Encoding.
    """
    printDebugMessage('restRequest', 'Begin', 11)
    printDebugMessage('restRequest', 'url: ' + url, 11)
    # python 2
    url = urllib.quote(url, safe="%/:=&?~#+!$,;'@()*[]")
    # python 3
    # url = urllib.request.quote(url, safe="%/:=&?~#+!$,;'@()*[]")

    http_headers = {
        'User-Agent': getUserAgent(),
        'Accept-Encoding': 'gzip'
    }
    req = urllib2.Request(url, None, http_headers)
    resp = urllib2.urlopen(req)
    # From here on the connection is open: make sure it is released even if
    # reading, decompressing or the encoding check raises.
    try:
        # python2
        encoding = resp.info().getheader('Content-Encoding')
        # python3
        # encoding = resp.info().__getitem__('Content-Encoding')
        if encoding is None or encoding == 'identity':
            # python2
            result = resp.read()
            # python3
            # result = str(resp.read(), 'utf-8')
        elif encoding == 'gzip':
            result = resp.read()
            printDebugMessage('restRequest', 'result: ' + str(result), 21)
            # python2
            gz = GzipFile(
                fileobj=StringIO(result),
                mode="r")
            result = gz.read()
            # python3
            # result = str(gzip.decompress(result), 'utf-8')
        else:
            raise Exception('Unsupported Content-Encoding')
    finally:
        resp.close()
    printDebugMessage('restRequest', 'result: ' + result, 11)
    printDebugMessage('restRequest', 'End', 11)
    return result
98 | |
99 | |
def hasSubdomains(domainInfo):
    """Return True if any child of ``domainInfo`` is named 'subdomains'.

    Children are read from ``domainInfo._dir`` and compared by ``_name``.
    """
    return any(child._name == 'subdomains' for child in domainInfo._dir)
105 | |
106 | |
def extractUsefulFields(fieldInfos):
    """Collect the ids of searchable and of retrievable fields.

    Each item of ``fieldInfos`` is queried for its ``id`` attribute and its
    ``options/option`` children. A field id is added to the searchable
    (respectively retrievable) list for every option named "searchable"
    (respectively "retrievable") whose text is "true". The field with id
    "$facets" is skipped.

    Returns:
        A ``(searchable, retrievable)`` tuple of lists of field ids, in the
        order the fields were encountered.
    """
    searchable = []
    retrievable = []
    # Map an option name onto the list that collects matching field ids.
    buckets = {"searchable": searchable, "retrievable": retrievable}

    for info in fieldInfos:
        field_id = info('id')
        if field_id == "$facets":
            continue
        for opt in info['options']['option':]:
            bucket = buckets.get(opt("name"))
            if bucket is not None and str(opt) == "true":
                bucket.append(field_id)
    return searchable, retrievable
122 | |
123 | |
def extractLowerLevelDomains(domainInfo, domains):
    """Record every leaf domain below ``domainInfo`` into ``domains``.

    A domain without a 'subdomains' child is a leaf: its name plus its
    sorted searchable and retrievable field ids are stored under its id.
    Otherwise each sub-domain is processed recursively.

    Returns:
        The ``domains`` mapping (updated in place) keyed by domain id, each
        value holding "name", "searchable_fields" and "retrievable_fields".
    """
    if not hasSubdomains(domainInfo):
        searchable, retrievable = extractUsefulFields(
            domainInfo['fieldInfos']['fieldInfo':])

        record = domains.setdefault(domainInfo('id'), {})
        record["name"] = domainInfo('name')
        record["searchable_fields"] = sorted(searchable)
        record["retrievable_fields"] = sorted(retrievable)
        return domains

    for child in domainInfo['subdomains']['domain':]:
        domains = extractLowerLevelDomains(child, domains)
    return domains
139 | |
140 | |
141 # Get domain Hierarchy | |
def getDomainHierarchy():
    """Fetch the '/allebi' domain tree and return its leaf domains.

    Returns:
        The mapping built by extractLowerLevelDomains() starting from the
        ``domains/domain`` element of the response and an empty dict.
    """
    doc = xmltramp.parse(restRequest(baseUrl + '/allebi'))
    root_domain = doc['domains']['domain']
    hierarchy = extractLowerLevelDomains(root_domain, {})
    printDebugMessage('getDomainHierarchy', 'End', 1)
    return hierarchy
150 | |
151 | |
152 # Check if a databaseInfo matches a database name. | |
def is_database(dbInfo, dbName):
    """Return True if ``dbName`` is the name or an alias of ``dbInfo``.

    ``dbInfo.name`` is compared first; ``dbInfo.aliasList`` is only
    consulted when the name does not match.
    """
    printDebugMessage('is_database', 'Begin', 11)
    retVal = str(dbInfo.name) == dbName or any(
        str(alias) == dbName for alias in dbInfo.aliasList)
    printDebugMessage('is_database', 'retVal: ' + str(retVal), 11)
    printDebugMessage('is_database', 'End', 11)
    return retVal
165 | |
166 | |
167 # Get number of results | |
def getNumberOfResults(domain, query):
    """Return the hit count reported for ``query`` in ``domain``.

    The request asks for zero entries (``size=0``) and the ``hitCount``
    element of the response is converted to an int.
    """
    printDebugMessage('getNumberOfResults', 'Begin', 1)
    requestUrl = ''.join(
        [baseUrl, '/', domain, '?query=', query, '&size=0'])
    printDebugMessage('getNumberOfResults', requestUrl, 2)
    hit_count = xmltramp.parse(restRequest(requestUrl))['hitCount']
    numberOfResults = int(str(hit_count))
    printDebugMessage('getNumberOfResults', 'End', 1)
    return numberOfResults
177 | |
178 | |
def makeRequest(requestUrl):
    """Fetch ``requestUrl`` and return its entries formatted as text.

    The ``entries/entry`` elements of the parsed response are passed to
    printEntries(), whose return value is handed back.
    """
    response = xmltramp.parse(restRequest(requestUrl))
    return printEntries(response['entries']['entry':])
185 | |
186 | |
187 # Get search results | |
def getResults(domain, query, fields):
    """Print every result of ``query`` in ``domain`` as tab-separated text.

    The first line is a header made of the requested ``fields`` (a
    comma-separated string) followed by a ``link`` column. Results are then
    fetched in pages of at most ``maximum_size`` entries; the last page
    only requests the entries that remain.

    Args:
        domain: identifier of the domain to search.
        query: query string, inserted as-is into the request URL.
        fields: comma-separated names of the fields to retrieve.
    """
    numberOfResults = getNumberOfResults(domain, query)
    maximum_size = 100

    printDebugMessage('getResults', 'Begin', 1)
    pages = ["%s\tlink\n" % (fields.replace(",", "\t"))]
    # Stepping through the offsets avoids an explicit quotient, so the loop
    # does not depend on '/' being integer division, and the final partial
    # page falls out of the same code path as the full ones.
    for start in range(0, numberOfResults, maximum_size):
        size = min(maximum_size, numberOfResults - start)
        requestUrl = baseUrl + '/' + domain + '?query=' + query
        requestUrl += '&fields=' + fields + '&size=' + str(size)
        requestUrl += '&start=' + str(start) + '&fieldurl=true'
        pages.append(makeRequest(requestUrl))

    print("".join(pages))
212 | |
213 | |
def printEntries(entries):
    """Format ``entries`` as tab-separated lines and return the text.

    Each entry yields one line terminated by a newline. Its columns are, in
    order: one column per ``fields/field`` element holding that field's
    values joined by commas (empty when the field has no value), then the
    comma-joined field URLs if the entry has a 'fieldURLs' child, then the
    comma-joined view URLs if it has a 'viewURLs' child.
    """
    printDebugMessage('printEntries', 'Begin', 1)
    lines = []
    for entry in entries:
        columns = [
            ",".join("%s" % (value,) for value in field['values']['value':])
            for field in entry['fields']['field':]
        ]
        if hasFieldUrls(entry):
            columns.append(
                ",".join(str(url) for url in entry['fieldURLs']['fieldURL':]))
        if hasViewUrls(entry):
            columns.append(
                ",".join(str(url) for url in entry['viewURLs']['viewURL':]))
        lines.append("\t".join(columns) + "\n")
    output = "".join(lines)
    printDebugMessage('printEntries', 'End', 1)
    return output
245 | |
246 | |
def hasFieldUrls(entry):
    """Return True if any child of ``entry`` is named 'fieldURLs'.

    Children are read from ``entry._dir`` and compared by ``_name``.
    """
    return any(child._name == 'fieldURLs' for child in entry._dir)
252 | |
253 | |
def hasViewUrls(entry):
    """Return True if any child of ``entry`` is named 'viewURLs'.

    Children are read from ``entry._dir`` and compared by ``_name``.
    """
    return any(child._name == 'viewURLs' for child in entry._dir)
259 | |
260 | |
def getRunLink(run_id):
    """Print the field URL(s) of a metagenomics run, using the https scheme.

    The entry ``run_id`` of the 'metagenomics_runs' domain is requested with
    ``fieldurl=true``. The text of every ``fieldURLs/fieldURL`` element of
    every returned entry is concatenated (no separator) and printed after
    each ``http://`` prefix has been turned into ``https://``.

    Args:
        run_id: identifier of the run, inserted as-is into the request URL.
    """
    printDebugMessage('getRunLink', 'Begin', 1)
    requestUrl = baseUrl + '/metagenomics_runs/entry/' + run_id + '?fieldurl=true'
    printDebugMessage('getRunLink', requestUrl, 2)
    doc = xmltramp.parse(restRequest(requestUrl))
    fieldURL = ''.join(
        str(fieldurl)
        for entry in doc['entries']['entry':]
        for fieldurl in entry['fieldURLs']['fieldURL':]
    )
    printDebugMessage('getRunLink', 'End', 1)
    # Match the full 'http://' scheme prefix only: substituting the bare
    # word 'http' would turn an existing 'https://' into 'httpss://' and
    # would also rewrite 'http' occurring inside a path or query.
    fieldURL = fieldURL.replace('http://', 'https://')
    print(fieldURL)
276 | |
277 | |
if __name__ == '__main__':
    # Command-line entry point: the first positional argument selects the
    # action, the remaining ones are that action's parameters.

    # Usage message
    usage = """
    %prog getDomainHierarchy
    %prog getResults <domain> <query> <fields>
    %prog getRunLink <runId>
    """

    # Each fragment ends with '. ' so the sentences do not run together
    # when concatenated.
    description = "Tools to query and download data from several EMBL-EBI databases. "
    description += "The searching tools are using the EB-eye search engine. "
    description += "http://www.ebi.ac.uk/ebisearch/"
    # Process command-line options
    parser = OptionParser(
        usage=usage,
        description=description,
        version='1.0')
    (options, args) = parser.parse_args()

    # No arguments, print usage
    if len(args) < 1:
        parser.print_help()

    # Get domain hierarchy
    elif args[0] == 'getDomainHierarchy':
        # NOTE(review): the returned mapping is discarded here, so this
        # action produces no output unless debugging is enabled - confirm
        # whether the hierarchy was meant to be printed.
        getDomainHierarchy()

    # Get search results
    elif args[0] == 'getResults':
        if len(args) < 4:
            print('domain, query and fields should be given.')
        else:
            getResults(args[1], args[2], args[3])

    # Get run link results
    elif args[0] == 'getRunLink':
        if len(args) < 2:
            print('run id should be given.')
        else:
            getRunLink(args[1])

    # Unknown argument combination, display usage
    else:
        print('Error: unrecognised argument combination')
        parser.print_help()