comparison fetch_fasta_from_NCBI.py @ 3:8be88084f89c draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/fetch_fasta_from_ncbi commit ab45487db8cc69750f92a40d763d51ffac940e25
author artbio
date Wed, 08 Nov 2017 13:00:26 -0500
parents 50f5ef3313bb
children c667d0ee39f5
comparison
equal deleted inserted replaced
2:50f5ef3313bb 3:8be88084f89c
19 retmax of efetch is 1/10 of declared value from NCBI 19 retmax of efetch is 1/10 of declared value from NCBI
20 20
21 queries are 1 sec delayed, to satisfy NCBI guidelines 21 queries are 1 sec delayed, to satisfy NCBI guidelines
22 (more than what they request) 22 (more than what they request)
23 """ 23 """
24 import argparse
24 import httplib 25 import httplib
25 import logging 26 import logging
26 import optparse
27 import re 27 import re
28 import sys 28 import sys
29 import time 29 import time
30 import urllib 30 import urllib
31 import urllib2 31 import urllib2
53 self.retmax_esearch = 100000 53 self.retmax_esearch = 100000
54 self.retmax_efetch = 500 54 self.retmax_efetch = 500
55 self.count = 0 55 self.count = 0
56 self.webenv = "" 56 self.webenv = ""
57 self.query_key = "" 57 self.query_key = ""
58 self.datetype = options.datetype 58
59 if options.reldate: 59 def dry_run(self):
60 self.reldate = options.reldate 60 self.get_count_value()
61 else:
62 self.reldate = ''
63 if options.mindate:
64 self.mindate = options.mindate
65 else:
66 self.mindate = ''
67 if options.maxdate:
68 self.maxdate = options.maxdate
69 else:
70 self.maxdate = ''
71 61
72 def retrieve(self): 62 def retrieve(self):
73 """ 63 """
74 Retrieve the fasta sequences corresponding to the query 64 Retrieve the fasta sequences corresponding to the query
75 """ 65 """
76 self.get_count_value() 66 self.get_count_value()
77
78 # If no UIDs are found exit script 67 # If no UIDs are found exit script
79 if self.count > 0: 68 if self.count > 0:
80 self.get_uids_list() 69 self.get_uids_list()
81 try: 70 try:
82 self.get_sequences() 71 self.get_sequences()
99 """ 88 """
100 self.logger.info("retrieving data from %s" % self.base) 89 self.logger.info("retrieving data from %s" % self.base)
101 self.logger.info("for Query: %s and database: %s" % 90 self.logger.info("for Query: %s and database: %s" %
102 (self.query_string, self.dbname)) 91 (self.query_string, self.dbname))
103 querylog = self.esearch(self.dbname, self.query_string, '', '', 92 querylog = self.esearch(self.dbname, self.query_string, '', '',
104 "count", self.datetype, self.reldate, 93 "count")
105 self.mindate, self.maxdate)
106 self.logger.debug("Query response:") 94 self.logger.debug("Query response:")
107 for line in querylog: 95 for line in querylog:
108 self.logger.debug(line.rstrip()) 96 self.logger.debug(line.rstrip())
109 if '</Count>' in line: 97 if '</Count>' in line:
110 self.count = int(line[line.find('<Count>')+len('<Count>'): 98 self.count = int(line[line.find('<Count>')+len('<Count>'):
125 self.logger.info("Batch size for esearch action: %d UIDs" % retmax) 113 self.logger.info("Batch size for esearch action: %d UIDs" % retmax)
126 self.logger.info("Number of batches for esearch action: %d " % 114 self.logger.info("Number of batches for esearch action: %d " %
127 num_batches) 115 num_batches)
128 for n in range(num_batches): 116 for n in range(num_batches):
129 querylog = self.esearch(self.dbname, self.query_string, n*retmax, 117 querylog = self.esearch(self.dbname, self.query_string, n*retmax,
130 retmax, '', self.datetype, self.reldate, 118 retmax, '')
131 self.mindate, self.maxdate)
132 for line in querylog: 119 for line in querylog:
133 if '<Id>' in line and '</Id>' in line: 120 if '<Id>' in line and '</Id>' in line:
134 uid = (line[line.find('<Id>')+len('<Id>'): 121 uid = (line[line.find('<Id>')+len('<Id>'):
135 line.find('</Id>')]) 122 line.find('</Id>')])
136 self.ids.append(uid) 123 self.ids.append(uid)
137 self.logger.info("Retrieved %d UIDs" % len(self.ids)) 124 self.logger.info("Retrieved %d UIDs" % len(self.ids))
138 125
139 def esearch(self, db, term, retstart, retmax, rettype, datetype, reldate, 126 def esearch(self, db, term, retstart, retmax, rettype):
140 mindate, maxdate):
141 url = self.base + "esearch.fcgi" 127 url = self.base + "esearch.fcgi"
142 self.logger.debug("url: %s" % url) 128 self.logger.debug("url: %s" % url)
143 values = {'db': db, 129 values = {'db': db,
144 'term': term, 130 'term': term,
145 'rettype': rettype, 131 'rettype': rettype,
146 'retstart': retstart, 132 'retstart': retstart,
147 'retmax': retmax, 133 'retmax': retmax}
148 'datetype': datetype,
149 'reldate': reldate,
150 'mindate': mindate,
151 'maxdate': maxdate}
152 data = urllib.urlencode(values) 134 data = urllib.urlencode(values)
153 self.logger.debug("data: %s" % str(data)) 135 self.logger.debug("data: %s" % str(data))
154 req = urllib2.Request(url, data) 136 req = urllib2.Request(url, data)
155 response = urllib2.urlopen(req) 137 response = urllib2.urlopen(req)
156 querylog = response.readlines() 138 querylog = response.readlines()
223 response_code = 0 205 response_code = 0
224 while not serverTransaction: 206 while not serverTransaction:
225 counter += 1 207 counter += 1
226 self.logger.info("Server Transaction Trial: %s" % (counter)) 208 self.logger.info("Server Transaction Trial: %s" % (counter))
227 try: 209 try:
210 self.logger.debug("Going to open")
228 response = urllib2.urlopen(req) 211 response = urllib2.urlopen(req)
212 self.logger.debug("Going to get code")
229 response_code = response.getcode() 213 response_code = response.getcode()
214 self.logger.debug("Going to read, de code was : %s",
215 str(response_code))
230 fasta = response.read() 216 fasta = response.read()
217 self.logger.debug("Did all that")
231 response.close() 218 response.close()
232 if((response_code != 200) or 219 if((response_code != 200) or
233 ("Resource temporarily unavailable" in fasta) or 220 ("Resource temporarily unavailable" in fasta) or
234 ("Error" in fasta) or (not fasta.startswith(">"))): 221 ("Error" in fasta) or (not fasta.startswith(">"))):
235 serverTransaction = False 222 serverTransaction = False
346 LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s' 333 LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
347 LOG_DATEFMT = '%Y-%m-%d %H:%M:%S' 334 LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
348 LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'] 335 LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
349 336
350 337
338 def command_parse():
339 parser = argparse.ArgumentParser(description='Retrieve data from NCBI')
340 parser.add_argument('-i', dest='query_string', help='NCBI Query String',
341 required=True)
342 parser.add_argument('-o', dest='outname', help='output file name')
343 parser.add_argument('-d', dest='dbname', help='database type')
344 parser.add_argument('--count', '-c', dest='count_ids',
345 action='store_true', default=False,
346 help='dry run ouputing only the number of sequences\
347 found')
348 parser.add_argument('-l', '--logfile', help='log file (default=stderr)')
349 parser.add_argument('--loglevel', choices=LOG_LEVELS, default='INFO',
350 help='logging level (default: INFO)')
351 args = parser.parse_args()
352 return args
353
354
351 def __main__(): 355 def __main__():
352 """ main function """ 356 """ main function """
353 parser = optparse.OptionParser(description='Retrieve data from NCBI') 357 args = command_parse()
354 parser.add_option('-i', dest='query_string', help='NCBI Query String') 358 log_level = getattr(logging, args.loglevel)
355 parser.add_option('-o', dest='outname', help='output file name')
356 parser.add_option('-d', dest='dbname', help='database type')
357 parser.add_option('-l', '--logfile', help='log file (default=stderr)')
358 parser.add_option('--datetype', dest='datetype',
359 choices=['mdat', 'pdat'],
360 help='Type of date used to limit a search.\
361 [ mdat(modification date), pdat(publication date)]\
362 (default=pdat)', default='pdat')
363 parser.add_option('--reldate', dest='reldate',
364 help='When reldate is set to an integer n, the search\
365 returns only those items that have a date\
366 specified by datetype within the last n days.')
367 parser.add_option('--maxdate', dest='maxdate',
368 help='Date range used to limit a search result by the\
369 date specified by datetype. These two parameters\
370 (mindate, maxdate) must be used together to\
371 specify an arbitrary date range. The general date\
372 format is YYYY/MM/DD, and these variants are also\
373 allowed: YYYY, YYYY/MM.')
374 parser.add_option('--mindate', dest='mindate',
375 help='Date range used to limit a search result by the\
376 date specified by datetype. These two parameters\
377 (mindate, maxdate) must be used together to\
378 specify an arbitrary date range. The general date\
379 format is YYYY/MM/DD, and these variants are also\
380 allowed: YYYY, YYYY/MM.')
381 parser.add_option('--loglevel', choices=LOG_LEVELS, default='INFO',
382 help='logging level (default: INFO)')
383 (options, args) = parser.parse_args()
384 if len(args) > 0:
385 parser.error('Wrong number of arguments')
386 if((options.reldate and options.maxdate) or
387 (options.reldate and options.mindate)):
388 parser.error("You can't mix 'reldate' and 'maxdate', 'mindate'\
389 parameters")
390 if((options.mindate and not options.maxdate) or
391 (options.maxdate and not options.mindate)):
392 parser.error("mindate and maxdate must be used together")
393
394 log_level = getattr(logging, options.loglevel)
395 kwargs = {'format': LOG_FORMAT, 359 kwargs = {'format': LOG_FORMAT,
396 'datefmt': LOG_DATEFMT, 360 'datefmt': LOG_DATEFMT,
397 'level': log_level} 361 'level': log_level}
398 if options.logfile: 362 if args.logfile:
399 kwargs['filename'] = options.logfile 363 kwargs['filename'] = args.logfile
400 logging.basicConfig(**kwargs) 364 logging.basicConfig(**kwargs)
401 logger = logging.getLogger('data_from_NCBI') 365 logger = logging.getLogger('data_from_NCBI')
402 366
403 E = Eutils(options, logger) 367 E = Eutils(args, logger)
404 try: 368 if args.count_ids:
405 E.retrieve() 369 try:
406 except Exception: 370 E.dry_run()
407 sys.exit(1) 371 except Exception:
372 sys.exit(1)
373 else:
374 try:
375 E.retrieve()
376 except Exception:
377 sys.exit(1)
408 378
409 379
410 if __name__ == "__main__": 380 if __name__ == "__main__":
411 __main__() 381 __main__()