comparison eutils.py @ 3:254f40d3ae2b draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ncbi_entrez_eutils commit dae34e5e182b4cceb808d7353080f14aa9a78ca9"
author iuc
date Wed, 23 Sep 2020 09:50:11 +0000
parents cb5a0fe9e036
children 7fe990069458
comparison
equal deleted inserted replaced
2:cb5a0fe9e036 3:254f40d3ae2b
10 10
11 class Client(object): 11 class Client(object):
12 12
13 def __init__(self, history_file=None, user_email=None, admin_email=None): 13 def __init__(self, history_file=None, user_email=None, admin_email=None):
14 self.using_history = False 14 self.using_history = False
15 self.using_parsedids = False
15 16
16 if user_email is not None and admin_email is not None: 17 if user_email is not None and admin_email is not None:
17 Entrez.email = ';'.join((admin_email, user_email)) 18 Entrez.email = ';'.join((admin_email, user_email))
18 elif user_email is not None: 19 elif user_email is not None:
19 Entrez.email = user_email 20 Entrez.email = user_email
27 "administrator email in NCBI_EUTILS_CONTACT") 28 "administrator email in NCBI_EUTILS_CONTACT")
28 29
29 if history_file is not None: 30 if history_file is not None:
30 with open(history_file, 'r') as handle: 31 with open(history_file, 'r') as handle:
31 data = json.loads(handle.read()) 32 data = json.loads(handle.read())
32 self.query_key = data['QueryKey'] 33 # esearch
33 self.webenv = data['WebEnv'] 34 if 'QueryKey' in data:
34 self.using_history = True 35 self.query_key = data['QueryKey']
36 self.webenv = data['WebEnv']
37 self.query_keys = []
38 self.query_keys += [data['QueryKey']]
39 self.using_history = True
40 elif 'query_key' in data:
41 self.query_key = data['query_key']
42 self.webenv = data['WebEnv']
43 self.query_keys = []
44 self.query_keys += [data['query_key']]
45 self.using_history = True
46 elif 'esearchresult' in data:
47 self.query_key = data['esearchresult']['querykey']
48 self.webenv = data['esearchresult']['webenv']
49 self.query_keys = []
50 self.query_keys += [data['esearchresult']['querykey']]
51 self.using_history = True
52 # elink
53 elif 'linksets' in data:
54 # elink for cmd=neighbor_history
55 if 'linksetdbhistories' in data['linksets'][0]:
56 self.webenv = data['linksets'][0]['webenv']
57 self.query_key = data['linksets'][0]['linksetdbhistories'][0]['querykey']
58 self.using_history = True
59 # elink for cmd=neighbor|neighbor_score
60 elif 'linksetdbs' in data['linksets'][0]:
61 self.using_parsedids = True
62 # elink for neighbor
63 if isinstance(data['linksets'][0]['linksetdbs'][0]['links'][0], str):
64 self.idstr = ','.join(data['linksets'][0]['linksetdbs'][0]['links'])
65 # elink for neighbor_score
66 else:
67 self.idstr = ','.join(map(lambda x: x['id'], data['linksets'][0]['linksetdbs'][0]['links']))
68 if 'linksetdbhistories' in data['linksets'][0]:
69 self.webenv = data['linksets'][0]['webenv']
70 self.query_keys = []
71 for query in data['linksets'][0]['linksetdbhistories']:
72 if 'querykey' in query:
73 self.query_keys += [query['querykey']]
74 else:
75 print("No match")
76 print(data)
35 77
36 def get_history(self): 78 def get_history(self):
37 if not self.using_history: 79 if self.using_history:
38 return {}
39 else:
40 return { 80 return {
41 'query_key': self.query_key, 81 'query_key': self.query_key,
42 'WebEnv': self.webenv, 82 'WebEnv': self.webenv,
43 } 83 }
84 elif self.using_parsedids:
85 return {
86 'id': self.idstr,
87 }
88 else:
89 return {}
90
91 def get_histories(self):
92 histories = []
93 for key in self.query_keys:
94 histories += [{'WebEnv': self.webenv, 'query_key': key}]
95 return histories
44 96
45 def post(self, database, **payload): 97 def post(self, database, **payload):
46 return json.dumps(Entrez.read(Entrez.epost(database, **payload)), indent=4) 98 return json.dumps(Entrez.read(Entrez.epost(database, **payload)), indent=4)
47 99
48 def fetch(self, db, ftype=None, **payload): 100 def fetch(self, db, ftype=None, **payload):
49 os.makedirs("downloads") 101 os.makedirs("downloads")
50 102
51 if 'id' in payload: 103 if 'id' in payload:
52 summary = self.id_summary(db, payload['id']) 104 summary = self.id_summary(db, payload['id'])
105 elif 'WebEnv' not in payload or 'query_key' not in payload:
106 summary = self.history_summary(db)
53 else: 107 else:
54 summary = self.history_summary(db) 108 summary = payload
55 109
56 count = len(summary) 110 count = len(summary)
57 payload['retmax'] = BATCH_SIZE 111 payload['retmax'] = BATCH_SIZE
58 112
59 # This may be bad. I'm not sure yet. I think it will be ... but UGH. 113 # This may be bad. I'm not sure yet. I think it will be ... but UGH.
85 return Entrez.esummary(**payload).read() 139 return Entrez.esummary(**payload).read()
86 140
87 def link(self, **payload): 141 def link(self, **payload):
88 return Entrez.elink(**payload).read() 142 return Entrez.elink(**payload).read()
89 143
90 def extract_history(self, xml_data): 144 def extract_history_from_xml_file(self, xml_file):
91 parsed_data = Entrez.read(StringIO.StringIO(xml_data))
92 history = {} 145 history = {}
93 for key in ('QueryKey', 'WebEnv'): 146 with open(xml_file, 'r') as handle:
94 if key in parsed_data: 147 xml_str = handle.read()
95 history[key] = parsed_data[key] 148 history = self.extract_history_from_xml(xml_str)
96
97 return history 149 return history
150
151 def extract_history_from_xml(self, xml_str):
152 try:
153 parsed_data = Entrez.read(StringIO(xml_str))
154 history = {}
155 gotit = 0
156
157 # New code doesn't work for esearch input to elink - Parsing esearch output (reading an xml history) does not work as an elink input payload, which needs 'QueryKey'. Notably, if parsing elink output as input to elink, conversion of xml 'QueryKey' to 'query_key' is needed for some reason. Also Notably, efetch returned results using the 'QueryKey' key
158 # For esearch xml history results
159 if 'QueryKey' in parsed_data:
160 history['query_key'] = parsed_data['QueryKey']
161 gotit += 1
162 if 'WebEnv' in parsed_data:
163 history['WebEnv'] = parsed_data['WebEnv']
164 gotit += 1
165 # For elink xml history results
166 if gotit < 2:
167 if 'LinkSetDbHistory' in parsed_data[0]:
168 if 'QueryKey' in parsed_data[0]['LinkSetDbHistory'][0]:
169 history['query_key'] = parsed_data[0]['LinkSetDbHistory'][0]['QueryKey']
170 gotit += 1
171 if 'WebEnv' in parsed_data[0]:
172 history['WebEnv'] = parsed_data[0]['WebEnv']
173 gotit += 1
174 if gotit < 2:
175 raise Exception("Could not find WebEnv in xml response")
176 except Exception as e:
177 print("Error parsing...")
178 print(xml_str)
179 raise(e)
180
181 return history
182
183 def extract_histories_from_xml_file(self, xml_file):
184 histories = []
185 with open(xml_file, 'r') as handle:
186 xml_str = handle.read()
187 histories = self.extract_histories_from_xml(xml_str)
188 return histories
189
190 def extract_histories_from_xml(self, xml_str):
191 try:
192 parsed_data = Entrez.read(StringIO(xml_str))
193 histories = []
194 gotit = 0
195
196 # New code doesn't work for esearch input to elink - Parsing esearch output (reading an xml history) does not work as an elink input payload, which needs 'QueryKey'. Notably, if parsing elink output as input to elink, conversion of xml 'QueryKey' to 'query_key' is needed for some reason. Also Notably, efetch returned results using the 'QueryKey' key
197 # For esearch xml history results
198 if 'QueryKey' in parsed_data:
199 tmp_hist = {}
200 tmp_hist['query_key'] = parsed_data['QueryKey']
201 gotit += 1
202 if 'WebEnv' in parsed_data:
203 tmp_hist['WebEnv'] = parsed_data['WebEnv']
204 gotit += 1
205 if gotit == 2:
206 histories += [tmp_hist]
207 # For elink xml history results
208 else:
209 gotenv = 0
210 if 'LinkSetDbHistory' in parsed_data[0]:
211 for query in parsed_data[0]['LinkSetDbHistory']:
212 tmp_hist = {}
213 if 'WebEnv' in parsed_data[0]:
214 tmp_hist['WebEnv'] = parsed_data[0]['WebEnv']
215 if 'QueryKey' in query:
216 tmp_hist['query_key'] = query['QueryKey']
217 histories += [tmp_hist]
218 gotit += 1
219 if gotit == 0 and gotenv == 0:
220 raise Exception("Could not find WebEnv in xml response")
221 except Exception as e:
222 print("Error parsing...")
223 print(xml_str)
224 raise(e)
225
226 return histories
98 227
99 def search(self, **payload): 228 def search(self, **payload):
100 return Entrez.esearch(**payload).read() 229 return Entrez.esearch(**payload).read()
101 230
102 def info(self, **kwargs): 231 def info(self, **kwargs):
107 236
108 def citmatch(self, **kwargs): 237 def citmatch(self, **kwargs):
109 return Entrez.ecitmatch(**kwargs).read() 238 return Entrez.ecitmatch(**kwargs).read()
110 239
111 @classmethod 240 @classmethod
112 def parse_ids(cls, id_list, id, history_file): 241 def jsonstring2jsondata(cls, json_str):
242 json_handle = StringIO(json_str)
243 json_data = json.loads(json_handle.read())
244 return json_data
245
246 @classmethod
247 def jsonfile2UIlist(cls, json_file):
248 merged_ids = []
249 with open(json_file, 'r') as handle:
250 json_data = json.loads(handle.read())
251 for id in cls.jsondata2UIlist(json_data):
252 merged_ids += [id]
253 return merged_ids
254
255 @classmethod
256 def jsondata2UIlist(cls, json_data):
257 merged_ids = []
258
259 # Always prioritize the result links as opposed to the search links
260 # elink - retrieves linked IDs for cmd=neighbor|neighbor_score only
261 if 'linksets' in json_data:
262 for lnk in json_data['linksets'][0]['linksetdbs']:
263 if 'links' in lnk:
264 for id in lnk['links']:
265 # elink for neighbor
266 if isinstance(id, str):
267 merged_ids.append(id)
268 # elink for neighbor_score
269 else:
270 merged_ids.append(id['id'])
271 # esearch
272 elif 'esearchresult' in json_data:
273 for id in json_data['esearchresult']['idlist']:
274 merged_ids += [id]
275
276 return merged_ids
277
278 @classmethod
279 def xmlfile2UIlist(cls, xml_file):
280 merged_ids = []
281 with open(xml_file, 'r') as handle:
282 xml_data = Entrez.read(handle)
283 for id in cls.xmldata2UIlist(xml_data):
284 merged_ids += [id]
285 return merged_ids
286
287 @classmethod
288 def xmlstring2UIlist(cls, xml_str):
289 merged_ids = []
290 xml_data = Entrez.read(StringIO(xml_str))
291 for id in cls.xmldata2UIlist(xml_data):
292 merged_ids += [id]
293 return merged_ids
294
295 @classmethod
296 def xmldata2UIlist(cls, xml_data):
297 merged_ids = []
298
299 try:
300 # Always prioritize the result links as opposed to the search links
301 # elink - retrieves linked IDs for cmd=neighbor|neighbor_score only
302 if 'LinkSetDb' in xml_data[0]:
303 for lnk in xml_data[0]['LinkSetDb'][0]['Link']:
304 # elink for neighbor
305 if isinstance(lnk, str):
306 merged_ids.append(lnk)
307 # elink for neighbor_score
308 else:
309 merged_ids.append(lnk['Id'])
310 # esearch
311 elif 'IdList' in xml_data:
312 for id in xml_data['IdList']:
313 merged_ids += [id]
314 # If it was not elink output, we will end up here
315 except Exception:
316 # esearch
317 if 'IdList' in xml_data:
318 for id in xml_data['IdList']:
319 merged_ids += [id]
320
321 return merged_ids
322
323 @classmethod
324 def parse_ids(cls, id_list, id, history_file, xml_file, json_file):
113 """Parse IDs passed on --cli or in a file passed to the cli 325 """Parse IDs passed on --cli or in a file passed to the cli
114 """ 326 """
115 merged_ids = [] 327 merged_ids = []
116 if id is not None: 328 if id is not None:
117 for pid in id.replace('__cn__', ',').replace('\n', ',').split(','): 329 for pid in id.replace('__cn__', ',').replace('\n', ',').split(','):
120 332
121 if id_list is not None: 333 if id_list is not None:
122 with open(id_list, 'r') as handle: 334 with open(id_list, 'r') as handle:
123 merged_ids += [x.strip() for x in handle.readlines()] 335 merged_ids += [x.strip() for x in handle.readlines()]
124 336
125 # Exception hanlded here for uniformity 337 if xml_file is not None:
126 if len(merged_ids) == 0 and history_file is None: 338 tmp_ids = cls.xmlfile2UIlist(xml_file)
127 raise Exception("Must provide history file or IDs") 339 for id in tmp_ids:
128 340 merged_ids += [id]
129 return merged_ids 341
342 if json_file is not None:
343 tmp_ids = cls.jsonfile2UIlist(json_file)
344 for id in tmp_ids:
345 merged_ids += [id]
346
347 return merged_ids
348
349 @classmethod
350 def getVersion(cls):
351 """Return the biopython version
352 """
353 import Bio
354 return Bio.__version__