comparison fetch_fasta_from_NCBI.py @ 5:706fe8139955 draft

"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/fetch_fasta_from_ncbi commit b5ef783237b244d684e26b1ed1cc333a8305ce3e"
author artbio
date Tue, 16 Mar 2021 23:26:58 +0000
parents c667d0ee39f5
children 4af77e1af12a
diff -r c667d0ee39f5 -r 706fe8139955 fetch_fasta_from_NCBI.py
--- a/fetch_fasta_from_NCBI.py
+++ b/fetch_fasta_from_NCBI.py
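
The change set ports the script from Python 2 to Python 3. The tool's core logic, unchanged by the port, is a two-step NCBI E-utilities conversation: esearch resolves a query to UIDs, efetch turns UIDs into FASTA. A minimal sketch of that flow, with illustrative helper names rather than the script's own API:

    # Minimal esearch -> efetch round trip against NCBI E-utilities.
    # Helper names are illustrative; the real script adds batching,
    # retries, history-server paging and FASTA sanitising.
    from urllib import parse, request

    BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

    def esearch_uids(db, term, retmax=20):
        # esearch returns XML whose <Id> elements list matching UIDs
        data = parse.urlencode({'db': db, 'term': term, 'retmax': retmax})
        with request.urlopen(BASE + "esearch.fcgi", data.encode('utf-8')) as r:
            xml = r.read().decode('utf-8')
        return [chunk.split("</Id>")[0] for chunk in xml.split("<Id>")[1:]]

    def efetch_fasta(db, uids):
        # efetch with rettype=fasta returns plain multi-FASTA text
        data = parse.urlencode({'db': db, 'id': ','.join(uids),
                                'rettype': 'fasta', 'retmode': 'text'})
        with request.urlopen(BASE + "efetch.fcgi", data.encode('utf-8')) as r:
            return r.read().decode('utf-8')
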
@@ -20,17 +20,21 @@
 
 queries are 1 sec delayed, to satisfy NCBI guidelines
 (more than what they request)
 """
 import argparse
-import httplib
+import http.client
 import logging
 import re
 import sys
 import time
-import urllib
-import urllib2
+from urllib import error, parse, request
 
 
+LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
+LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
+LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
+
+
 class QueryException(Exception):
     pass
 
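
The import hunk is the heart of the port: httplib becomes http.client, and the urllib/urllib2 pair collapses into urllib.parse, urllib.request and urllib.error, with request bodies now explicitly encoded to bytes. A hedged sketch of the Python 3 call pattern the rest of the diff applies (the einfo endpoint is used here only as a harmless example):

    # Python 3 replacements for the removed Python 2 calls:
    #   urllib.urlencode  -> urllib.parse.urlencode
    #   urllib2.Request   -> urllib.request.Request (body must be bytes)
    #   urllib2.urlopen   -> urllib.request.urlopen
    #   urllib2.HTTPError -> urllib.error.HTTPError, etc.
    from urllib import error, parse, request

    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi"
    data = parse.urlencode({'retmode': 'xml'})
    req = request.Request(url, data.encode('utf-8'))  # bytes, not str
    try:
        response = request.urlopen(req)
        body = response.read()   # bytes in Python 3: decode before parsing
        response.close()
        print(body[:60].decode('utf-8'))
    except error.HTTPError as e:
        print("HTTP error:", e.code)
    except error.URLError as e:
        print("network error:", e.reason)
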
43 """ 47 """
44 self.logger = logger 48 self.logger = logger
45 self.base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" 49 self.base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
46 self.query_string = options.query_string 50 self.query_string = options.query_string
47 self.dbname = options.dbname 51 self.dbname = options.dbname
48 if options.outname: 52 if options.get_fasta:
49 self.outname = options.outname 53 self.get_fasta = options.get_fasta
50 else:
51 self.outname = 'NCBI_download' + '.' + self.dbname + '.fasta'
52 self.ids = [] 54 self.ids = []
53 self.retmax_esearch = 100000 55 self.retmax_esearch = 100000
54 self.retmax_efetch = 500 56 self.retmax_efetch = 500
55 self.count = 0 57 self.webenv = ''
56 self.webenv = "" 58 self.usehistory = ''
57 self.query_key = "" 59 self.query_key = ''
58 if options.get_uids: 60 self.iuds_file = options.iuds_file
59 self.get_uids = True 61 if self.iuds_file:
62 with open(self.iuds_file, 'r') as f:
63 for line in f:
64 self.ids.append(line.rstrip())
65 self.count = len(self.ids) # 0 if query, some value if iuds_file
66
67 def retrieve(self):
68 """
69 Retrieve the iuds and fastas corresponding to the query
70 """
71 if len(self.ids) == 0: # retrieving from query (not required for file)
72 self.count = self.ecount()
73 # If no UIDs were found from query or file, exit
74 if self.count == 0:
75 self.logger.info("found no UIDs. Exiting script.")
76 sys.exit(-1)
77 if not self.iuds_file:
78 self.get_uids_list()
79 self.print_uids_list()
60 else: 80 else:
61 self.get_uids = False 81 # as self.ids already implemented
62 if options.iuds_file: 82 self.print_uids_list()
63 with open(options.iuds_file, 'r') as f: 83 if self.get_fasta:
64 self.ids.extend(f.readline().split(' ')) 84 try:
65 85 self.get_sequences()
66 def dry_run(self): 86 except QueryException as e:
67 self.get_count_value() 87 self.logger.error("Exiting script.")
68 88 raise e
69 def retrieve(self): 89
70 """ 90 def print_uids_list(self):
71 Retrieve the fasta sequences corresponding to the query 91 with open("retrieved_uid_list.txt", 'w') as f:
72 """ 92 f.write('\n'.join(self.ids))
73 if len(self.ids) == 0: 93
74 self.get_count_value() 94 def ecount(self):
75 else:
76 self.count = len(self.ids)
77 # If no UIDs are found exit script
78 if self.count > 0:
79 if len(self.ids) == 0:
80 self.get_uids_list()
81 if not self.get_uids:
82 try:
83 self.get_sequences()
84 except QueryException as e:
85 self.logger.error("Exiting script.")
86 raise e
87 else:
88 with open(self.outname, 'w') as f:
89 f.write('\t'.join(self.ids)+'\n')
90 else:
91 self.logger.error("No UIDs were found. Exiting script.")
92 raise Exception("")
93
94 def get_count_value(self):
95 """ 95 """
96 just to retrieve Count (number of UIDs) 96 just to retrieve Count (number of UIDs)
97 Total number of UIDs from the retrieved set to be shown in the XML 97 Total number of UIDs from the retrieved set to be shown in the XML
98 output (default=20). By default, ESearch only includes the first 20 98 output (default=20). By default, ESearch only includes the first 20
99 UIDs retrieved in the XML output. If usehistory is set to 'y', 99 UIDs retrieved in the XML output. If usehistory is set to 'y',
100 the remainder of the retrieved set will be stored on the History server 100 the remainder of the retrieved set will be stored on the History server
101
102 http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch 101 http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
103 """ 102 """
104 self.logger.info("retrieving data from %s" % self.base)
105 self.logger.info("for Query: %s and database: %s" %
106 (self.query_string, self.dbname))
107 querylog = self.esearch(self.dbname, self.query_string, '', '', 103 querylog = self.esearch(self.dbname, self.query_string, '', '',
108 "count") 104 'count')
109 self.logger.debug("Query response:") 105 self.logger.debug("Query response:")
110 for line in querylog: 106 for line in querylog:
107 line = line.decode('utf-8')
111 self.logger.debug(line.rstrip()) 108 self.logger.debug(line.rstrip())
112 if '</Count>' in line: 109 if '</Count>' in line:
113 self.count = int(line[line.find('<Count>')+len('<Count>'): 110 count = int(line.split("<Count>")[1].split("</Count>")[0])
114 line.find('</Count>')]) 111 self.logger.info("Found %d UIDs" % count)
115 self.logger.info("Found %d UIDs" % self.count) 112 return count
116 113
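
The rewritten ecount() (formerly get_count_value) extracts the hit count by string-splitting the ESearch XML instead of find()/slicing, and returns it rather than mutating self.count. The idiom on a sample response line (the XML is made up for illustration):

    # How ecount() pulls Count out of an ESearch response line.
    line = "<eSearchResult><Count>2157</Count><RetMax>20</RetMax>"
    if '</Count>' in line:
        count = int(line.split("<Count>")[1].split("</Count>")[0])
        print(count)  # 2157
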
@@ -117,23 +114,128 @@
     def get_uids_list(self):
         """
         Increasing retmax allows more of the retrieved UIDs to be included in
         the XML output, up to a maximum of 100,000 records.
         from http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
         """
         retmax = self.retmax_esearch
-        if (self.count > retmax):
-            num_batches = (self.count / retmax) + 1
+        self.logger.info("retmax = %s, self.count = %s" % (retmax, self.count))
+        if (int(self.count) > retmax):
+            num_batches = int(self.count / retmax) + 1
+            self.usehistory = 'y'
+            self.logger.info("Batch size for esearch action: %d UIDs" % retmax)
+            self.logger.info("Number of batches for esearch action: %s"
+                             % num_batches)
+            querylog = self.esearch(self.dbname, self.query_string, '', '', '')
+            for line in querylog:
+                line = line.decode('utf-8')
+                self.logger.debug(line.rstrip())
+                if '<WebEnv>' in line:
+                    self.webenv = line.split("<WebEnv>")[1].split(
+                        "</WebEnv>")[0]
+            self.logger.info("Will use webenv %s" % self.webenv)
+            for n in range(num_batches):
+                querylog = self.esearch(self.dbname, self.query_string,
+                                        n*retmax, retmax, '')
+                for line in querylog:
+                    line = line.decode('utf-8')
+                    if '<Id>' in line and '</Id>' in line:
+                        uid = line.split("<Id>")[1].split("</Id>")[0]
+                        self.ids.append(uid)
+            self.logger.info("Retrieved %d UIDs" % len(self.ids))
+
         else:
-            num_batches = 1
-        self.logger.info("Batch size for esearch action: %d UIDs" % retmax)
-        self.logger.info("Number of batches for esearch action: %d " %
-                         num_batches)
-        for n in range(num_batches):
-            querylog = self.esearch(self.dbname, self.query_string, n*retmax,
+            self.logger.info("Batch size for esearch action: %d UIDs" % retmax)
+            self.logger.info("Number of batches for esearch action: 1")
+            querylog = self.esearch(self.dbname, self.query_string, 0,
                                     retmax, '')
             for line in querylog:
+                line = line.decode('utf-8')
                 if '<Id>' in line and '</Id>' in line:
-                    uid = (line[line.find('<Id>')+len('<Id>'):
-                           line.find('</Id>')])
+                    uid = line.split("<Id>")[1].split("</Id>")[0]
                     self.ids.append(uid)
         self.logger.info("Retrieved %d UIDs" % len(self.ids))
+        return self.ids
+
+    def get_sequences(self):
+        batch_size = self.retmax_efetch
+        count = self.count
+        uids_list = self.ids
+        self.logger.info("Batch size for efetch action: %d" % batch_size)
+        self.logger.info("Number of batches for efetch action: %d" %
+                         ((count / batch_size) + 1))
+        with open(self.get_fasta, 'w') as out:
+            for start in range(0, count, batch_size):
+                end = min(count, start+batch_size)
+                batch = uids_list[start:end]
+                self.logger.info("retrieving batch %d" %
+                                 ((start / batch_size) + 1))
+                try:
+                    mfasta = self.efetch(self.dbname, ','.join(batch))
+                    out.write(mfasta + '\n')
+                except QueryException as e:
+                    self.logger.error("%s" % e.message)
+                    raise e
+        request.urlcleanup()
+
+    def efetch(self, db, uid_list):
+        url = self.base + "efetch.fcgi"
+        self.logger.debug("url_efetch: %s" % url)
+        values = {'db': db,
+                  'id': uid_list,
+                  'rettype': "fasta",
+                  'retmode': "text",
+                  'usehistory': self.usehistory,
+                  'WebEnv': self.webenv}
+        data = parse.urlencode(values)
+        req = request.Request(url, data.encode('utf-8'))
+        self.logger.debug("data: %s" % str(data))
+        serverTransaction = False
+        counter = 0
+        response_code = 0
+        while not serverTransaction:
+            counter += 1
+            self.logger.info("Server Transaction Trial: %s" % (counter))
+            try:
+                self.logger.debug("Going to open")
+                response = request.urlopen(req)
+                self.logger.debug("Going to get code")
+                response_code = response.getcode()
+                self.logger.debug("Going to read, de code was : %s",
+                                  str(response_code))
+                fasta = response.read()
+                self.logger.debug("Did all that")
+                response.close()
+                if((response_code != 200) or
+                   (b"Resource temporarily unavailable" in fasta) or
+                   (b"Error" in fasta) or (not fasta.startswith(b">"))):
+                    serverTransaction = False
+                    if (response_code != 200):
+                        self.logger.info("urlopen error: Response code is not\
+                                          200")
+                    elif ("Resource temporarily unavailable" in fasta):
+                        self.logger.info("Ressource temporarily unavailable")
+                    elif ("Error" in fasta):
+                        self.logger.info("Error in fasta")
+                    else:
+                        self.logger.info("Fasta doesn't start with '>'")
+                else:
+                    serverTransaction = True
+            except error.HTTPError as e:
+                serverTransaction = False
+                self.logger.info("urlopen error:%s, %s" % (e.code, e.read()))
+            except error.URLError as e:
+                serverTransaction = False
+                self.logger.info("urlopen error: Failed to reach a server")
+                self.logger.info("Reason :%s" % (e.reason))
+            except http.client.IncompleteRead as e:
+                serverTransaction = False
+                self.logger.info("IncompleteRead error: %s" % (e.partial))
+            if (counter > 500):
+                serverTransaction = True
+        if (counter > 500):
+            raise QueryException({"message":
+                                  "500 Server Transaction Trials attempted for\
+                                   this batch. Aborting."})
+        fasta = self.sanitiser(self.dbname, fasta.decode('utf-8'))
+        time.sleep(0.1)
+        return fasta
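
The relocated efetch() keeps the defensive loop from the old version: a transaction only succeeds when the HTTP code is 200 and the payload actually looks like FASTA, with a hard cap of 500 trials. Its control flow, reduced to a skeleton (fetch and validate are illustrative stand-ins for the urlopen call and the response checks):

    # Skeleton of the efetch() retry loop: retry until the payload
    # validates, abort after 500 trials. fetch/validate stand in for
    # request.urlopen() and the code-200/starts-with-'>' checks.
    class QueryException(Exception):
        pass

    def fetch_with_retries(fetch, validate, max_trials=500):
        counter = 0
        done = False
        payload = None
        while not done:
            counter += 1
            try:
                payload = fetch()
                done = validate(payload)
            except OSError:            # HTTPError/URLError subclass OSError
                done = False
            if counter > max_trials:   # bail out of the loop ...
                done = True
        if counter > max_trials:       # ... then report the failure
            raise QueryException({"message": "too many trials, aborting"})
        return payload
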
@@ -140,29 +242,31 @@
 
     def esearch(self, db, term, retstart, retmax, rettype):
         url = self.base + "esearch.fcgi"
         self.logger.debug("url: %s" % url)
         values = {'db': db,
                   'term': term,
                   'rettype': rettype,
                   'retstart': retstart,
-                  'retmax': retmax}
-        data = urllib.urlencode(values)
+                  'retmax': retmax,
+                  'usehistory': self.usehistory,
+                  'WebEnv': self.webenv}
+        data = parse.urlencode(values)
         self.logger.debug("data: %s" % str(data))
-        req = urllib2.Request(url, data)
-        response = urllib2.urlopen(req)
+        req = request.Request(url, data.encode('utf-8'))
+        response = request.urlopen(req)
         querylog = response.readlines()
         response.close()
         time.sleep(1)
         return querylog
 
     def sanitiser(self, db, fastaseq):
         if(db not in "nuccore protein"):
             return fastaseq
         regex = re.compile(r"[ACDEFGHIKLMNPQRSTVWYBZ]{49,}")
         sane_seqlist = []
-        seqlist = fastaseq.split("\n\n")
+        seqlist = fastaseq.split('\n\n')
         for seq in seqlist[:-1]:
             fastalines = seq.split("\n")
             if len(fastalines) < 2:
                 self.logger.info("Empty sequence for %s" %
                                  ("|".join(fastalines[0].split("|")[:4])))
180 | %s" % (float(badnuc)/len(fastalines[1]), 284 | %s" % (float(badnuc)/len(fastalines[1]),
181 "|".join(fastalines[0].split("|") 285 "|".join(fastalines[0].split("|")
182 [:4]), 286 [:4]),
183 fastalines[1])) 287 fastalines[1]))
184 self.logger.info("%s download is skipped" % 288 self.logger.info("%s download is skipped" %
185 (fastalines[0].split("|")[:4])) 289 fastalines[0].split("|")[:4])
186 continue 290 continue
187 """ remove spaces and trim the header to 100 chars """ 291 """ remove spaces and trim the header to 100 chars """
188 fastalines[0] = fastalines[0].replace(" ", "_")[:100] 292 fastalines[0] = fastalines[0].replace(" ", "_")[:100]
189 cleanseq = "\n".join(fastalines) 293 cleanseq = "\n".join(fastalines)
190 sane_seqlist.append(cleanseq) 294 sane_seqlist.append(cleanseq)
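
The sanitiser() context above relies on the residue-alphabet regex declared at the top of the method: a run of 49 or more IUPAC residue letters is treated as genuine sequence. A quick illustration of what the pattern does and does not match (sequences invented for the example):

    import re

    # Same pattern as sanitiser(): 49+ consecutive residue letters.
    regex = re.compile(r"[ACDEFGHIKLMNPQRSTVWYBZ]{49,}")

    good = "ATGC" * 20               # 80 uninterrupted residues: matches
    bad = "ATGC-NNNN-ATGC"           # runs of 4, broken by '-': no match
    print(bool(regex.search(good)))  # True
    print(bool(regex.search(bad)))   # False
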
@@ -200,119 +304,28 @@
             cleanseq = "\n".join(fastalines)
             sane_seqlist.append(cleanseq)
         self.logger.info("clean sequences appended: %d" % (len(sane_seqlist)))
         return "\n".join(sane_seqlist)
 
-    def efetch(self, db, uid_list):
-        url = self.base + "efetch.fcgi"
-        self.logger.debug("url_efetch: %s" % url)
-        values = {'db': db,
-                  'id': uid_list,
-                  'rettype': "fasta",
-                  'retmode': "text"}
-        data = urllib.urlencode(values)
-        req = urllib2.Request(url, data)
-        self.logger.debug("data: %s" % str(data))
-        serverTransaction = False
-        counter = 0
-        response_code = 0
-        while not serverTransaction:
-            counter += 1
-            self.logger.info("Server Transaction Trial: %s" % (counter))
-            try:
-                self.logger.debug("Going to open")
-                response = urllib2.urlopen(req)
-                self.logger.debug("Going to get code")
-                response_code = response.getcode()
-                self.logger.debug("Going to read, de code was : %s",
-                                  str(response_code))
-                fasta = response.read()
-                self.logger.debug("Did all that")
-                response.close()
-                if((response_code != 200) or
-                   ("Resource temporarily unavailable" in fasta) or
-                   ("Error" in fasta) or (not fasta.startswith(">"))):
-                    serverTransaction = False
-                    if (response_code != 200):
-                        self.logger.info("urlopen error: Response code is not\
-                                          200")
-                    elif ("Resource temporarily unavailable" in fasta):
-                        self.logger.info("Ressource temporarily unavailable")
-                    elif ("Error" in fasta):
-                        self.logger.info("Error in fasta")
-                    else:
-                        self.logger.info("Fasta doesn't start with '>'")
-                else:
-                    serverTransaction = True
-            except urllib2.HTTPError as e:
-                serverTransaction = False
-                self.logger.info("urlopen error:%s, %s" % (e.code, e.read()))
-            except urllib2.URLError as e:
-                serverTransaction = False
-                self.logger.info("urlopen error: Failed to reach a server")
-                self.logger.info("Reason :%s" % (e.reason))
-            except httplib.IncompleteRead as e:
-                serverTransaction = False
-                self.logger.info("IncompleteRead error: %s" % (e.partial))
-            if (counter > 500):
-                serverTransaction = True
-        if (counter > 500):
-            raise QueryException({"message":
-                                  "500 Server Transaction Trials attempted for\
-                                   this batch. Aborting."})
-        fasta = self.sanitiser(self.dbname, fasta)
-        time.sleep(0.1)
-        return fasta
-
-    def get_sequences(self):
-        batch_size = 200
-        count = self.count
-        uids_list = self.ids
-        self.logger.info("Batch size for efetch action: %d" % batch_size)
-        self.logger.info("Number of batches for efetch action: %d" %
-                         ((count / batch_size) + 1))
-        with open(self.outname, 'w') as out:
-            for start in range(0, count, batch_size):
-                end = min(count, start+batch_size)
-                batch = uids_list[start:end]
-                self.logger.info("retrieving batch %d" %
-                                 ((start / batch_size) + 1))
-                try:
-                    mfasta = self.efetch(self.dbname, ','.join(batch))
-                    out.write(mfasta + '\n')
-                except QueryException as e:
-                    self.logger.error("%s" % e.message)
-                    raise e
-        urllib.urlcleanup()
-
-
-LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
-LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
-LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
-
 
 def command_parse():
     parser = argparse.ArgumentParser(description='Retrieve data from NCBI')
-    parser.add_argument('-i', dest='query_string', help='NCBI Query String')
-    parser.add_argument('--UID_list', dest='iuds_file',
-                        help='file containing a list of iuds to be fetched')
-    parser.add_argument('-o', dest='outname', help='output file name')
-    parser.add_argument('-d', dest='dbname', help='database type')
-    parser.add_argument('--count', '-c', dest='count_ids',
-                        action='store_true', default=False,
-                        help='dry run ouputing only the number of sequences\
-                              found')
-    parser.add_argument('--get_uids', '-u', dest='get_uids', default=False,
-                        action='store_true', help='prints to the output a list\
-                                                   of UIDs')
-    parser.add_argument('-l', '--logfile', help='log file (default=stderr)')
+    parser.add_argument('--query', '-i', dest='query_string',
+                        default=None, help='NCBI Query String')
+    parser.add_argument('--iud_file', dest='iuds_file', default=None,
+                        help='input list of iuds to be fetched')
+    parser.add_argument('--dbname', '-d', dest='dbname', help='database type')
+    parser.add_argument('--fasta', '-F', dest='get_fasta', default=False,
+                        help='file with retrieved fasta sequences')
+    parser.add_argument('--logfile', '-l', help='log file (default=stderr)')
     parser.add_argument('--loglevel', choices=LOG_LEVELS, default='INFO',
                         help='logging level (default: INFO)')
     args = parser.parse_args()
+
     if args.query_string is not None and args.iuds_file is not None:
-        parser.error('Please choose either fetching the -i query or the -u\
-                      list.')
+        parser.error('Please choose either fetching by query (--query) \
+                      or by uid list (--iud_file)')
     return args
 
 
 def __main__():
     """ main function """
@@ -323,21 +336,14 @@
               'level': log_level}
     if args.logfile:
         kwargs['filename'] = args.logfile
     logging.basicConfig(**kwargs)
     logger = logging.getLogger('data_from_NCBI')
-
     E = Eutils(args, logger)
-    if args.count_ids:
-        try:
-            E.dry_run()
-        except Exception:
-            sys.exit(-1)
-    else:
-        try:
-            E.retrieve()
-        except Exception:
-            sys.exit(-1)
+    try:
+        E.retrieve()
+    except Exception:
+        sys.exit(-1)
 
 
 if __name__ == "__main__":
     __main__()
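
For reference, the logging bootstrap that __main__ assembles from LOG_FORMAT, LOG_DATEFMT and the --logfile/--loglevel options (partly elided in the hunk above) boils down to roughly this; the real code builds kwargs from the parsed arguments rather than literals:

    import logging

    LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
    LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'

    kwargs = {'format': LOG_FORMAT,
              'datefmt': LOG_DATEFMT,
              'level': getattr(logging, 'INFO')}   # from --loglevel
    # a --logfile argument adds kwargs['filename'] = args.logfile
    logging.basicConfig(**kwargs)
    logger = logging.getLogger('data_from_NCBI')
    logger.info("logger configured")
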