comparison ebeye_urllib.py @ 0:bb7989bd88ba draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ebi_tools commit 7a9c88c1c80b80aaa63e55e9d9125b6a4dd695ac
author iuc
date Thu, 01 Dec 2016 15:28:20 -0500
parents
children b6029f2c71cb
comparison
equal deleted inserted replaced
-1:000000000000 0:bb7989bd88ba
1 #!/usr/bin/env python
2 # ======================================================================
3 # Script derived from the EB-eye (REST) Python client available at
4 # http://www.ebi.ac.uk/Tools/webservices/services/eb-eye_rest
5 # and distributed under the Apache License
6 # ======================================================================
7 # Load libraries
8 import platform
9 import os
10 import urllib
11 import re
12 from optparse import OptionParser
13 from gzip import GzipFile
14 from xmltramp2 import xmltramp
15 # python2
16 from StringIO import StringIO
17 import urllib2
18 # python3
19 # import urllib.request as urllib2
20
21
# Service base URL for the EB-eye (EBI Search) REST API.
baseUrl = 'http://www.ebi.ac.uk/ebisearch/ws/rest'

# Debug level: printDebugMessage only prints messages whose level is
# <= debugLevel, so 0 keeps the client silent.
debugLevel = 0
27
28
# Debug print
def printDebugMessage(functionName, message, level):
    """Print '[functionName] message' when level is within debugLevel."""
    if level > debugLevel:
        return
    print('[' + functionName + '] ' + message)
33
34
# User-agent for request.
def getUserAgent():
    """Build the User-Agent header value identifying this client.

    Combines the client revision, script name, Python/platform versions
    and the urllib version into a single string.
    """
    printDebugMessage('getUserAgent', 'Begin', 11)
    urllib_agent = 'Python-urllib/%s' % urllib2.__version__
    clientRevision = '$Revision: 2468 $'
    # Extract the numeric revision from the SVN keyword when it is expanded.
    clientVersion = clientRevision[11:-2] if len(clientRevision) > 11 else '0'
    user_agent = 'EBI-Sample-Client/%s (%s; Python %s; %s) %s' % (
        clientVersion, os.path.basename(__file__),
        platform.python_version(), platform.system(),
        urllib_agent
    )
    printDebugMessage('getUserAgent', 'user_agent: ' + user_agent, 12)
    printDebugMessage('getUserAgent', 'End', 11)
    return user_agent
51
52
# Wrapper for a REST (HTTP GET) request
def restRequest(url):
    """Perform an HTTP GET on url and return the response body.

    Sends the request with this client's User-Agent and advertises gzip
    support; gzip-encoded responses are decompressed in memory before
    being returned. Raises urllib2.HTTPError on HTTP failures and a
    plain Exception on an unsupported Content-Encoding.
    """
    printDebugMessage('restRequest', 'Begin', 11)
    printDebugMessage('restRequest', 'url: ' + url, 11)
    # Percent-encode the URL while keeping characters that are legal in URLs.
    # python 2
    url = urllib.quote(url, safe="%/:=&?~#+!$,;'@()*[]")
    # python 3
    # url = urllib.request.quote(url, safe="%/:=&?~#+!$,;'@()*[]")

    try:
        user_agent = getUserAgent()
        http_headers = {
            'User-Agent': user_agent,
            'Accept-Encoding': 'gzip'
        }
        req = urllib2.Request(url, None, http_headers)
        resp = urllib2.urlopen(req)
        # python2
        encoding = resp.info().getheader('Content-Encoding')
        # python3
        # encoding = resp.info().__getitem__('Content-Encoding')
        result = None
        if encoding is None or encoding == 'identity':
            # Uncompressed body: read it as-is.
            # python2
            result = resp.read()
            # python3
            # result = str(resp.read(), 'utf-8')
        elif encoding == 'gzip':
            result = resp.read()
            printDebugMessage('restRequest', 'result: ' + str(result), 21)
            # Decompress the gzip payload entirely in memory.
            # python2
            gz = GzipFile(
                fileobj=StringIO(result),
                mode="r")
            result = gz.read()
            # python3
            # result = str(gzip.decompress(result), 'utf-8')
        else:
            raise Exception('Unsupported Content-Encoding')
        resp.close()
    except urllib2.HTTPError as ex:
        # NOTE(review): catch-and-reraise preserves the HTTPError for callers;
        # no extra handling is done here.
        raise ex
    printDebugMessage('restRequest', 'result: ' + result, 11)
    printDebugMessage('restRequest', 'End', 11)
    return result
98
99
def hasSubdomains(domainInfo):
    """Return True if the xmltramp domain node has a 'subdomains' child."""
    return any(child._name == 'subdomains' for child in domainInfo._dir)
105
106
def extractUsefulFields(fieldInfos):
    """Split field infos into (searchable, retrievable) field-id lists.

    The synthetic '$facets' field is skipped. A field id is collected in
    a list when its 'searchable'/'retrievable' option has the text 'true'.
    """
    searchable = []
    retrievable = []

    for info in fieldInfos:
        field_id = info('id')
        if field_id == "$facets":
            continue

        for option in info['options']['option':]:
            if str(option) != "true":
                continue
            flag = option("name")
            if flag == "searchable":
                searchable.append(field_id)
            elif flag == "retrievable":
                retrievable.append(field_id)
    return searchable, retrievable
122
123
def extractLowerLevelDomains(domainInfo, domains):
    """Recursively collect leaf domains of domainInfo into the domains dict.

    Each leaf domain id maps to its name plus sorted searchable and
    retrievable field lists. Returns the (mutated) domains dict.
    """
    if hasSubdomains(domainInfo):
        # Internal node: recurse into every subdomain.
        for subdomain in domainInfo['subdomains']['domain':]:
            domains = extractLowerLevelDomains(subdomain, domains)
        return domains

    # Leaf node: record its useful fields.
    searchable, retrievable = extractUsefulFields(
        domainInfo['fieldInfos']['fieldInfo':])
    entry = domains.setdefault(domainInfo('id'), {})
    entry["name"] = domainInfo('name')
    entry["searchable_fields"] = sorted(searchable)
    entry["retrievable_fields"] = sorted(retrievable)
    return domains
139
140
# Get domain Hierarchy
def getDomainHierarchy():
    """Fetch the EBI Search domain tree and return its leaf domains.

    Returns a dict keyed by domain id (see extractLowerLevelDomains).
    """
    xmlDoc = restRequest(baseUrl + '/allebi')
    doc = xmltramp.parse(xmlDoc)
    rootDomain = doc['domains']['domain']
    lower_level_domains = extractLowerLevelDomains(rootDomain, {})
    printDebugMessage('getDomainHierarchy', 'End', 1)
    return lower_level_domains
150
151
# Check if a databaseInfo matches a database name.
def is_database(dbInfo, dbName):
    """Return True when dbName matches the database's name or an alias."""
    printDebugMessage('is_database', 'Begin', 11)
    if str(dbInfo.name) == dbName:
        retVal = True
    else:
        retVal = any(str(alias) == dbName for alias in dbInfo.aliasList)
    printDebugMessage('is_database', 'retVal: ' + str(retVal), 11)
    printDebugMessage('is_database', 'End', 11)
    return retVal
165
166
# Get number of results
def getNumberOfResults(domain, query):
    """Return the total hit count for query in the given search domain."""
    printDebugMessage('getNumberOfResults', 'Begin', 1)
    requestUrl = baseUrl + '/' + domain + '?query=' + query + '&size=0'
    printDebugMessage('getNumberOfResults', requestUrl, 2)
    # size=0 asks only for the hit count, not any entries.
    doc = xmltramp.parse(restRequest(requestUrl))
    hitCount = int(str(doc['hitCount']))
    printDebugMessage('getNumberOfResults', 'End', 1)
    return hitCount
177
178
def makeRequest(requestUrl):
    """Fetch one page of search results and return it as formatted text."""
    doc = xmltramp.parse(restRequest(requestUrl))
    return printEntries(doc['entries']['entry':])
185
186
# Get search results
def getResults(domain, query, fields):
    """Print all search results for query in domain as a TSV table.

    Results are fetched in pages of at most maximum_size entries. The
    header row lists the requested fields plus a trailing 'link' column.
    """
    numberOfResults = getNumberOfResults(domain, query)
    maximum_size = 100
    # Floor division keeps quotient an int under Python 3 as well;
    # plain '/' would yield a float there and break range() below.
    quotient = numberOfResults // maximum_size
    start = 0

    printDebugMessage('getResults', 'Begin', 1)
    request_output = "%s\tlink\n" % (fields.replace(",", "\t"))
    for i in range(quotient):
        start = maximum_size * i
        requestUrl = baseUrl + '/' + domain + '?query=' + query
        requestUrl += '&fields=' + fields + '&size=' + str(maximum_size)
        requestUrl += '&start=' + str(start) + '&fieldurl=true'
        request_output += makeRequest(requestUrl)

    # Fetch the final partial page, if any. Use maximum_size here rather
    # than a hard-coded 100 so the logic stays correct if the page size
    # is ever changed.
    if (numberOfResults % maximum_size) > 0:
        start = maximum_size * quotient
        remainder = numberOfResults - start
        requestUrl = baseUrl + '/' + domain + '?query=' + query
        requestUrl += '&fields=' + fields + '&size=' + str(remainder)
        requestUrl += '&start=' + str(start) + '&fieldurl=true'
        request_output += makeRequest(requestUrl)

    print(request_output)
212
213
def printEntries(entries):
    """Render xmltramp result entries as tab-separated lines.

    Each entry becomes one line: per-field values joined by commas,
    followed by optional comma-joined field-URL and view-URL columns.
    Returns the assembled text (one trailing newline per entry).
    """
    printDebugMessage('printEntries', 'Begin', 1)
    lines = []
    for entry in entries:
        columns = []
        for field in entry['fields']['field':]:
            values = field['values']['value':]
            columns.append(",".join(str(v) for v in values))
        if hasFieldUrls(entry):
            columns.append(",".join(
                str(u) for u in entry['fieldURLs']['fieldURL':]))
        if hasViewUrls(entry):
            columns.append(",".join(
                str(u) for u in entry['viewURLs']['viewURL':]))
        lines.append("\t".join(columns))
    printDebugMessage('printEntries', 'End', 1)
    return "".join(line + "\n" for line in lines)
245
246
def hasFieldUrls(entry):
    """Return True if the xmltramp entry has a 'fieldURLs' child."""
    return any(child._name == 'fieldURLs' for child in entry._dir)
252
253
def hasViewUrls(entry):
    """Return True if the xmltramp entry has a 'viewURLs' child."""
    return any(child._name == 'viewURLs' for child in entry._dir)
259
260
def getRunLink(run_id):
    """Print the HTTPS link for a metagenomics run entry.

    Looks up run_id in the metagenomics_runs domain and prints the
    concatenation of its field URLs with the scheme upgraded to https.
    """
    printDebugMessage('getRunLink', 'Begin', 1)
    requestUrl = baseUrl + '/metagenomics_runs/entry/' + run_id + '?fieldurl=true'
    printDebugMessage('getRunLink', requestUrl, 2)
    xmlDoc = restRequest(requestUrl)
    doc = xmltramp.parse(xmlDoc)
    entries = doc['entries']['entry':]
    fieldURL = ''
    for entry in entries:
        for fieldurl in entry['fieldURLs']['fieldURL':]:
            fieldURL += str(fieldurl)
    printDebugMessage('getRunLink', 'End', 1)
    # Upgrade only the leading scheme. The previous unanchored
    # 'http' -> 'https' substitution rewrote every occurrence, turning
    # an already-https URL into 'httpss://...'.
    fieldURL = re.sub('^http://', 'https://', fieldURL)
    # print() call form is valid under both Python 2 and Python 3,
    # matching the rest of this file.
    print(fieldURL)
276
277
if __name__ == '__main__':
    # Usage message (%prog is substituted by optparse with the script name).
    usage = """
%prog getDomainHierarchy
%prog getResults <domain> <query> <fields>
%prog getRunLink <runId>
"""

    description = "Tools to query and download data from several EMBL-EBI databases"
    description += "The searching tools are using the EB-eye search engine. "
    description += "http://www.ebi.ac.uk/ebisearch/"

    # Process command-line options
    parser = OptionParser(
        usage=usage,
        description=description,
        version='1.0')
    options, args = parser.parse_args()

    command = args[0] if args else None

    if command is None:
        # No arguments: show the help text.
        parser.print_help()
    elif command == 'getDomainHierarchy':
        getDomainHierarchy()
    elif command == 'getResults':
        if len(args) < 4:
            print ('domain, query and fields should be given.')
        else:
            getResults(args[1], args[2], args[3])
    elif command == 'getRunLink':
        if len(args) < 2:
            print ('run id should be given.')
        else:
            getRunLink(args[1])
    else:
        # Unknown argument combination: report and show usage.
        print ('Error: unrecognised argument combination')
        parser.print_help()