fetch_fasta_from_ncbi: retrieve_fasta_from_NCBI.py

comparison retrieve_fasta_from_NCBI.py @ 4:64f45c5e94a0 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/fetch_fasta_from_ncbi commit ca132a4b5d5d7175e6e8bd62cc518397d14dad17

author	drosofff
date	Mon, 15 May 2017 03:10:11 -0400
parents	a9d8f69d59fb
children	c6de5c7b4ae3

comparison

Legend: equal, deleted (-), inserted (+), replaced

-:a9d8f69d59fb
+:64f45c5e94a0
 retmax of efetch is 1/10 of declared value from NCBI
 queries are 1 sec delayed, to satisfy NCBI guidelines (more than what they request)
-python get_fasta_from_taxon.py -i 1638 -o test.out -d protein
-python get_fasta_from_taxon.py -i 327045 -o test.out -d nuccore # 556468 UIDs
 """
 import sys
 import logging
 import optparse
 import time
 class Eutils:
 def __init__(self, options, logger):
+"""
+Initialize retrieval parameters
+"""
 self.logger = logger
 self.base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
 self.query_string = options.query_string
 self.dbname = options.dbname
 if options.outname:
 self.outname = options.outname
 else:
 self.outname = 'NCBI_download' + '.' + self.dbname + '.fasta'
 self.ids = []
 self.retmax_esearch = 100000
-self.retmax_efetch = 1000
+self.retmax_efetch = 500
 self.count = 0
 self.webenv = ""
 self.query_key = ""
 def retrieve(self):
-""" """
+"""
+Retrieve the fasta sequences corresponding to the query
+"""
 self.get_count_value()
-self.get_uids_list()
-self.get_sequences()
+# If no UIDs are found exit script
+if self.count > 0:
+self.get_uids_list()
+self.get_sequences()
+else:
+self.logger.info("No UIDs were found. Exiting script.")
 def get_count_value(self):
 """
 just to retrieve Count (number of UIDs)
 Total number of UIDs from the retrieved set to be shown in the XML
 self.logger.debug("Query response:")
 for line in querylog:
 self.logger.debug(line.rstrip())
 if '</Count>' in line:
 self.count = int(line[line.find('<Count>')+len('<Count>') : line.find('</Count>')])
-self.logger.info("Founded %d UIDs" % self.count)
+self.logger.info("Found %d UIDs" % self.count)
 def get_uids_list(self):
 """
 Increasing retmax allows more of the retrieved UIDs to be included in the XML output,
 up to a maximum of 100,000 records.
 data = urllib.urlencode(values)
 self.logger.debug("data: %s" % str(data))
 req = urllib2.Request(url, data)
 response = urllib2.urlopen(req)
 querylog = response.readlines()
+response.close()
 time.sleep(1)
 return querylog
 def epost(self, db, ids):
 url = self.base + "epost.fcgi"
 self.logger.debug("url_epost: %s" % url)
 values = {'db': db,
 'id': ids}
 data = urllib.urlencode(values)
 req = urllib2.Request(url, data)
-#self.logger.debug("data: %s" % str(data))
-req = urllib2.Request(url, data)
 serverResponse = False
+nb_trials = 0
 while not serverResponse:
+nb_trials += 1
 try:
+self.logger.debug("Try number %s for opening and readin URL %s" % ( nb_trials, url+data ))
 response = urllib2.urlopen(req)
+querylog = response.readlines()
+response.close()
 serverResponse = True
-except: # catch *all* exceptions
+except urllib2.HTTPError as e:
-e = sys.exc_info()[0]
+self.logger.info("urlopen error:%s, %s" % (e.code, e.read() ) )
-self.logger.info( "Catched Error: %s" % e )
+self.logger.info("Retrying in 1 sec")
-self.logger.info( "Retrying in 10 sec")
+serverResponse = False
-time.sleep(10)
+time.sleep(1)
-querylog = response.readlines()
+except urllib2.URLError as e:
+self.logger.info("urlopen error: Failed to reach a server")
+self.logger.info("Reason :%s" % ( e.reason ) )
+self.logger.info("Retrying in 1 sec")
+serverResponse = False
+time.sleep(1)
+except httplib.IncompleteRead as e:
+self.logger.info("IncompleteRead error:  %s" % ( e.partial ) )
+self.logger.info("Retrying in 1 sec")
+serverResponse = False
+time.sleep(1)
 self.logger.debug("query response:")
 for line in querylog:
 self.logger.debug(line.rstrip())
 if '</QueryKey>' in line:
 self.query_key = str(line[line.find('<QueryKey>')+len('<QueryKey>'):line.find('</QueryKey>')])
 'rettype': "fasta",
 'retmode': "text"}
 data = urllib.urlencode(values)
 req = urllib2.Request(url, data)
 self.logger.debug("data: %s" % str(data))
-req = urllib2.Request(url, data)
 serverTransaction = False
 counter = 0
+response_code = 0
 while not serverTransaction:
 counter += 1
 self.logger.info("Server Transaction Trial:  %s" % ( counter ) )
 try:
 response = urllib2.urlopen(req)
+response_code = response.getcode()
 fasta = response.read()
-if ("Resource temporarily unavailable" in fasta) or (not fasta.startswith(">") ):
+response.close()
+if ( (response_code != 200) or ("Resource temporarily unavailable" in fasta)
+or ("Error" in fasta) or (not fasta.startswith(">") ) ):
 serverTransaction = False
 else:
 serverTransaction = True
 except urllib2.HTTPError as e:
 serverTransaction = False
 self.logger.info("urlopen error:%s, %s" % (e.code, e.read() ) )
+except urllib2.URLError as e:
+serverTransaction = False
+self.logger.info("urlopen error: Failed to reach a server")
+self.logger.info("Reason :%s" % ( e.reason ) )
 except httplib.IncompleteRead as e:
 serverTransaction = False
 self.logger.info("IncompleteRead error:  %s" % ( e.partial ) )
-fasta = self.sanitiser(self.dbname, fasta) #
+fasta = self.sanitiser(self.dbname, fasta)
-time.sleep(1)
+time.sleep(0.1)
 return fasta
 def sanitiser(self, db, fastaseq):
 if db not in "nuccore protein" : return fastaseq
 regex = re.compile(r"[ACDEFGHIKLMNPQRSTVWYBZ]{49,}")
 self.logger.info("Number of batches for efetch action: %d" % ((count / batch_size) + 1))
 with open(self.outname, 'w') as out:
 for start in range(0, count, batch_size):
 end = min(count, start+batch_size)
 batch = uids_list[start:end]
-self.epost(self.dbname, ",".join(batch))
+if self.epost(self.dbname, ",".join(batch)) != -1:
 mfasta = ''
 while not mfasta:
 self.logger.info("retrieving batch %d" % ((start / batch_size) + 1))
 mfasta = self.efetch(self.dbname, self.query_key, self.webenv)
 out.write(mfasta + '\n')
 LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
 LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
 LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']

Mercurial > repos > drosofff > fetch_fasta_from_ncbi

comparison retrieve_fasta_from_NCBI.py @ 4:64f45c5e94a0 draft