Mercurial > repos > artbio > fetch_fasta_from_ncbi
comparison fetch_fasta_from_NCBI.py @ 3:8be88084f89c draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/fetch_fasta_from_ncbi commit ab45487db8cc69750f92a40d763d51ffac940e25
author | artbio |
---|---|
date | Wed, 08 Nov 2017 13:00:26 -0500 |
parents | 50f5ef3313bb |
children | c667d0ee39f5 |
comparison
Legend (row types in the table below): equal, deleted, inserted, replaced
2:50f5ef3313bb | 3:8be88084f89c |
---|---|
19 retmax of efetch is 1/10 of declared value from NCBI | 19 retmax of efetch is 1/10 of declared value from NCBI |
20 | 20 |
21 queries are 1 sec delayed, to satisfy NCBI guidelines | 21 queries are 1 sec delayed, to satisfy NCBI guidelines |
22 (more than what they request) | 22 (more than what they request) |
23 """ | 23 """ |
24 import argparse | |
24 import httplib | 25 import httplib |
25 import logging | 26 import logging |
26 import optparse | |
27 import re | 27 import re |
28 import sys | 28 import sys |
29 import time | 29 import time |
30 import urllib | 30 import urllib |
31 import urllib2 | 31 import urllib2 |
53 self.retmax_esearch = 100000 | 53 self.retmax_esearch = 100000 |
54 self.retmax_efetch = 500 | 54 self.retmax_efetch = 500 |
55 self.count = 0 | 55 self.count = 0 |
56 self.webenv = "" | 56 self.webenv = "" |
57 self.query_key = "" | 57 self.query_key = "" |
58 self.datetype = options.datetype | 58 |
59 if options.reldate: | 59 def dry_run(self): |
60 self.reldate = options.reldate | 60 self.get_count_value() |
61 else: | |
62 self.reldate = '' | |
63 if options.mindate: | |
64 self.mindate = options.mindate | |
65 else: | |
66 self.mindate = '' | |
67 if options.maxdate: | |
68 self.maxdate = options.maxdate | |
69 else: | |
70 self.maxdate = '' | |
71 | 61 |
72 def retrieve(self): | 62 def retrieve(self): |
73 """ | 63 """ |
74 Retrieve the fasta sequences corresponding to the query | 64 Retrieve the fasta sequences corresponding to the query |
75 """ | 65 """ |
76 self.get_count_value() | 66 self.get_count_value() |
77 | |
78 # If no UIDs are found exit script | 67 # If no UIDs are found exit script |
79 if self.count > 0: | 68 if self.count > 0: |
80 self.get_uids_list() | 69 self.get_uids_list() |
81 try: | 70 try: |
82 self.get_sequences() | 71 self.get_sequences() |
99 """ | 88 """ |
100 self.logger.info("retrieving data from %s" % self.base) | 89 self.logger.info("retrieving data from %s" % self.base) |
101 self.logger.info("for Query: %s and database: %s" % | 90 self.logger.info("for Query: %s and database: %s" % |
102 (self.query_string, self.dbname)) | 91 (self.query_string, self.dbname)) |
103 querylog = self.esearch(self.dbname, self.query_string, '', '', | 92 querylog = self.esearch(self.dbname, self.query_string, '', '', |
104 "count", self.datetype, self.reldate, | 93 "count") |
105 self.mindate, self.maxdate) | |
106 self.logger.debug("Query response:") | 94 self.logger.debug("Query response:") |
107 for line in querylog: | 95 for line in querylog: |
108 self.logger.debug(line.rstrip()) | 96 self.logger.debug(line.rstrip()) |
109 if '</Count>' in line: | 97 if '</Count>' in line: |
110 self.count = int(line[line.find('<Count>')+len('<Count>'): | 98 self.count = int(line[line.find('<Count>')+len('<Count>'): |
125 self.logger.info("Batch size for esearch action: %d UIDs" % retmax) | 113 self.logger.info("Batch size for esearch action: %d UIDs" % retmax) |
126 self.logger.info("Number of batches for esearch action: %d " % | 114 self.logger.info("Number of batches for esearch action: %d " % |
127 num_batches) | 115 num_batches) |
128 for n in range(num_batches): | 116 for n in range(num_batches): |
129 querylog = self.esearch(self.dbname, self.query_string, n*retmax, | 117 querylog = self.esearch(self.dbname, self.query_string, n*retmax, |
130 retmax, '', self.datetype, self.reldate, | 118 retmax, '') |
131 self.mindate, self.maxdate) | |
132 for line in querylog: | 119 for line in querylog: |
133 if '<Id>' in line and '</Id>' in line: | 120 if '<Id>' in line and '</Id>' in line: |
134 uid = (line[line.find('<Id>')+len('<Id>'): | 121 uid = (line[line.find('<Id>')+len('<Id>'): |
135 line.find('</Id>')]) | 122 line.find('</Id>')]) |
136 self.ids.append(uid) | 123 self.ids.append(uid) |
137 self.logger.info("Retrieved %d UIDs" % len(self.ids)) | 124 self.logger.info("Retrieved %d UIDs" % len(self.ids)) |
138 | 125 |
139 def esearch(self, db, term, retstart, retmax, rettype, datetype, reldate, | 126 def esearch(self, db, term, retstart, retmax, rettype): |
140 mindate, maxdate): | |
141 url = self.base + "esearch.fcgi" | 127 url = self.base + "esearch.fcgi" |
142 self.logger.debug("url: %s" % url) | 128 self.logger.debug("url: %s" % url) |
143 values = {'db': db, | 129 values = {'db': db, |
144 'term': term, | 130 'term': term, |
145 'rettype': rettype, | 131 'rettype': rettype, |
146 'retstart': retstart, | 132 'retstart': retstart, |
147 'retmax': retmax, | 133 'retmax': retmax} |
148 'datetype': datetype, | |
149 'reldate': reldate, | |
150 'mindate': mindate, | |
151 'maxdate': maxdate} | |
152 data = urllib.urlencode(values) | 134 data = urllib.urlencode(values) |
153 self.logger.debug("data: %s" % str(data)) | 135 self.logger.debug("data: %s" % str(data)) |
154 req = urllib2.Request(url, data) | 136 req = urllib2.Request(url, data) |
155 response = urllib2.urlopen(req) | 137 response = urllib2.urlopen(req) |
156 querylog = response.readlines() | 138 querylog = response.readlines() |
223 response_code = 0 | 205 response_code = 0 |
224 while not serverTransaction: | 206 while not serverTransaction: |
225 counter += 1 | 207 counter += 1 |
226 self.logger.info("Server Transaction Trial: %s" % (counter)) | 208 self.logger.info("Server Transaction Trial: %s" % (counter)) |
227 try: | 209 try: |
210 self.logger.debug("Going to open") | |
228 response = urllib2.urlopen(req) | 211 response = urllib2.urlopen(req) |
212 self.logger.debug("Going to get code") | |
229 response_code = response.getcode() | 213 response_code = response.getcode() |
214 self.logger.debug("Going to read, de code was : %s", | |
215 str(response_code)) | |
230 fasta = response.read() | 216 fasta = response.read() |
217 self.logger.debug("Did all that") | |
231 response.close() | 218 response.close() |
232 if((response_code != 200) or | 219 if((response_code != 200) or |
233 ("Resource temporarily unavailable" in fasta) or | 220 ("Resource temporarily unavailable" in fasta) or |
234 ("Error" in fasta) or (not fasta.startswith(">"))): | 221 ("Error" in fasta) or (not fasta.startswith(">"))): |
235 serverTransaction = False | 222 serverTransaction = False |
346 LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s' | 333 LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s' |
347 LOG_DATEFMT = '%Y-%m-%d %H:%M:%S' | 334 LOG_DATEFMT = '%Y-%m-%d %H:%M:%S' |
348 LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'] | 335 LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'] |
349 | 336 |
350 | 337 |
338 def command_parse(): | |
339 parser = argparse.ArgumentParser(description='Retrieve data from NCBI') | |
340 parser.add_argument('-i', dest='query_string', help='NCBI Query String', | |
341 required=True) | |
342 parser.add_argument('-o', dest='outname', help='output file name') | |
343 parser.add_argument('-d', dest='dbname', help='database type') | |
344 parser.add_argument('--count', '-c', dest='count_ids', | |
345 action='store_true', default=False, | |
346 help='dry run ouputing only the number of sequences\ | |
347 found') | |
348 parser.add_argument('-l', '--logfile', help='log file (default=stderr)') | |
349 parser.add_argument('--loglevel', choices=LOG_LEVELS, default='INFO', | |
350 help='logging level (default: INFO)') | |
351 args = parser.parse_args() | |
352 return args | |
353 | |
354 | |
351 def __main__(): | 355 def __main__(): |
352 """ main function """ | 356 """ main function """ |
353 parser = optparse.OptionParser(description='Retrieve data from NCBI') | 357 args = command_parse() |
354 parser.add_option('-i', dest='query_string', help='NCBI Query String') | 358 log_level = getattr(logging, args.loglevel) |
355 parser.add_option('-o', dest='outname', help='output file name') | |
356 parser.add_option('-d', dest='dbname', help='database type') | |
357 parser.add_option('-l', '--logfile', help='log file (default=stderr)') | |
358 parser.add_option('--datetype', dest='datetype', | |
359 choices=['mdat', 'pdat'], | |
360 help='Type of date used to limit a search.\ | |
361 [ mdat(modification date), pdat(publication date)]\ | |
362 (default=pdat)', default='pdat') | |
363 parser.add_option('--reldate', dest='reldate', | |
364 help='When reldate is set to an integer n, the search\ | |
365 returns only those items that have a date\ | |
366 specified by datetype within the last n days.') | |
367 parser.add_option('--maxdate', dest='maxdate', | |
368 help='Date range used to limit a search result by the\ | |
369 date specified by datetype. These two parameters\ | |
370 (mindate, maxdate) must be used together to\ | |
371 specify an arbitrary date range. The general date\ | |
372 format is YYYY/MM/DD, and these variants are also\ | |
373 allowed: YYYY, YYYY/MM.') | |
374 parser.add_option('--mindate', dest='mindate', | |
375 help='Date range used to limit a search result by the\ | |
376 date specified by datetype. These two parameters\ | |
377 (mindate, maxdate) must be used together to\ | |
378 specify an arbitrary date range. The general date\ | |
379 format is YYYY/MM/DD, and these variants are also\ | |
380 allowed: YYYY, YYYY/MM.') | |
381 parser.add_option('--loglevel', choices=LOG_LEVELS, default='INFO', | |
382 help='logging level (default: INFO)') | |
383 (options, args) = parser.parse_args() | |
384 if len(args) > 0: | |
385 parser.error('Wrong number of arguments') | |
386 if((options.reldate and options.maxdate) or | |
387 (options.reldate and options.mindate)): | |
388 parser.error("You can't mix 'reldate' and 'maxdate', 'mindate'\ | |
389 parameters") | |
390 if((options.mindate and not options.maxdate) or | |
391 (options.maxdate and not options.mindate)): | |
392 parser.error("mindate and maxdate must be used together") | |
393 | |
394 log_level = getattr(logging, options.loglevel) | |
395 kwargs = {'format': LOG_FORMAT, | 359 kwargs = {'format': LOG_FORMAT, |
396 'datefmt': LOG_DATEFMT, | 360 'datefmt': LOG_DATEFMT, |
397 'level': log_level} | 361 'level': log_level} |
398 if options.logfile: | 362 if args.logfile: |
399 kwargs['filename'] = options.logfile | 363 kwargs['filename'] = args.logfile |
400 logging.basicConfig(**kwargs) | 364 logging.basicConfig(**kwargs) |
401 logger = logging.getLogger('data_from_NCBI') | 365 logger = logging.getLogger('data_from_NCBI') |
402 | 366 |
403 E = Eutils(options, logger) | 367 E = Eutils(args, logger) |
404 try: | 368 if args.count_ids: |
405 E.retrieve() | 369 try: |
406 except Exception: | 370 E.dry_run() |
407 sys.exit(1) | 371 except Exception: |
372 sys.exit(1) | |
373 else: | |
374 try: | |
375 E.retrieve() | |
376 except Exception: | |
377 sys.exit(1) | |
408 | 378 |
409 | 379 |
410 if __name__ == "__main__": | 380 if __name__ == "__main__": |
411 __main__() | 381 __main__() |