Mercurial > repos > drosofff > fetch_fasta_from_ncbi
changeset 4:64f45c5e94a0 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/fetch_fasta_from_ncbi commit ca132a4b5d5d7175e6e8bd62cc518397d14dad17
author | drosofff |
---|---|
date | Mon, 15 May 2017 03:10:11 -0400 |
parents | a9d8f69d59fb |
children | c6de5c7b4ae3 |
files | retrieve_fasta_from_NCBI.py retrieve_fasta_from_NCBI.xml test-data/output.fa |
diffstat | 3 files changed, 56 insertions(+), 27 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/retrieve_fasta_from_NCBI.py Wed Nov 09 11:27:31 2016 -0500
+++ b/retrieve_fasta_from_NCBI.py Mon May 15 03:10:11 2017 -0400
@@ -21,8 +21,6 @@
 queries are 1 sec delayed, to satisfy NCBI guidelines (more than what they request)
 
 
-python get_fasta_from_taxon.py -i 1638 -o test.out -d protein
-python get_fasta_from_taxon.py -i 327045 -o test.out -d nuccore # 556468 UIDs
 """
 import sys
 import logging
@@ -37,6 +35,9 @@ class Eutils:
 
 
     def __init__(self, options, logger):
+        """
+        Initialize retrieval parameters
+        """
         self.logger = logger
         self.base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
         self.query_string = options.query_string
@@ -47,16 +48,23 @@
         self.outname = 'NCBI_download' + '.' + self.dbname + '.fasta'
         self.ids = []
         self.retmax_esearch = 100000
-        self.retmax_efetch = 1000
+        self.retmax_efetch = 500
         self.count = 0
         self.webenv = ""
         self.query_key = ""
 
     def retrieve(self):
-        """ """
+        """
+        Retrieve the fasta sequences corresponding to the query
+        """
         self.get_count_value()
-        self.get_uids_list()
-        self.get_sequences()
+
+        # If no UIDs are found exit script
+        if self.count > 0:
+            self.get_uids_list()
+            self.get_sequences()
+        else:
+            self.logger.info("No UIDs were found. Exiting script.")
 
     def get_count_value(self):
         """
@@ -77,7 +85,7 @@
             self.logger.debug(line.rstrip())
             if '</Count>' in line:
                 self.count = int(line[line.find('<Count>')+len('<Count>') : line.find('</Count>')])
-        self.logger.info("Founded %d UIDs" % self.count)
+        self.logger.info("Found %d UIDs" % self.count)
 
     def get_uids_list(self):
         """
@@ -113,6 +121,7 @@
         req = urllib2.Request(url, data)
         response = urllib2.urlopen(req)
         querylog = response.readlines()
+        response.close()
         time.sleep(1)
         return querylog
 
@@ -123,19 +132,32 @@
                   'id': ids}
         data = urllib.urlencode(values)
         req = urllib2.Request(url, data)
-        #self.logger.debug("data: %s" % str(data))
-        req = urllib2.Request(url, data)
         serverResponse = False
+        nb_trials = 0
         while not serverResponse:
+            nb_trials += 1
             try:
+                self.logger.debug("Try number %s for opening and readin URL %s" % ( nb_trials, url+data ))
                 response = urllib2.urlopen(req)
+                querylog = response.readlines()
+                response.close()
                 serverResponse = True
-            except: # catch *all* exceptions
-                e = sys.exc_info()[0]
-                self.logger.info( "Catched Error: %s" % e )
-                self.logger.info( "Retrying in 10 sec")
-                time.sleep(10)
-        querylog = response.readlines()
+            except urllib2.HTTPError as e:
+                self.logger.info("urlopen error:%s, %s" % (e.code, e.read() ) )
+                self.logger.info("Retrying in 1 sec")
+                serverResponse = False
+                time.sleep(1)
+            except urllib2.URLError as e:
+                self.logger.info("urlopen error: Failed to reach a server")
+                self.logger.info("Reason :%s" % ( e.reason ) )
+                self.logger.info("Retrying in 1 sec")
+                serverResponse = False
+                time.sleep(1)
+            except httplib.IncompleteRead as e:
+                self.logger.info("IncompleteRead error: %s" % ( e.partial ) )
+                self.logger.info("Retrying in 1 sec")
+                serverResponse = False
+                time.sleep(1)
         self.logger.debug("query response:")
         for line in querylog:
             self.logger.debug(line.rstrip())
@@ -159,27 +181,34 @@
         data = urllib.urlencode(values)
         req = urllib2.Request(url, data)
         self.logger.debug("data: %s" % str(data))
-        req = urllib2.Request(url, data)
         serverTransaction = False
         counter = 0
+        response_code = 0
         while not serverTransaction:
             counter += 1
             self.logger.info("Server Transaction Trial: %s" % ( counter ) )
             try:
                 response = urllib2.urlopen(req)
+                response_code = response.getcode()
                 fasta = response.read()
-                if ("Resource temporarily unavailable" in fasta) or (not fasta.startswith(">") ):
+                response.close()
+                if ( (response_code != 200) or ("Resource temporarily unavailable" in fasta)
+                    or ("Error" in fasta) or (not fasta.startswith(">") ) ):
                     serverTransaction = False
                 else:
                     serverTransaction = True
             except urllib2.HTTPError as e:
                 serverTransaction = False
                 self.logger.info("urlopen error:%s, %s" % (e.code, e.read() ) )
+            except urllib2.URLError as e:
+                serverTransaction = False
+                self.logger.info("urlopen error: Failed to reach a server")
+                self.logger.info("Reason :%s" % ( e.reason ) )
             except httplib.IncompleteRead as e:
                 serverTransaction = False
                 self.logger.info("IncompleteRead error: %s" % ( e.partial ) )
-        fasta = self.sanitiser(self.dbname, fasta) #
-        time.sleep(1)
+        fasta = self.sanitiser(self.dbname, fasta)
+        time.sleep(0.1)
         return fasta
 
     def sanitiser(self, db, fastaseq):
@@ -237,12 +266,12 @@
             for start in range(0, count, batch_size):
                 end = min(count, start+batch_size)
                 batch = uids_list[start:end]
-                self.epost(self.dbname, ",".join(batch))
-                mfasta = ''
-                while not mfasta:
-                    self.logger.info("retrieving batch %d" % ((start / batch_size) + 1))
-                    mfasta = self.efetch(self.dbname, self.query_key, self.webenv)
-                out.write(mfasta + '\n')
+                if self.epost(self.dbname, ",".join(batch)) != -1:
+                    mfasta = ''
+                    while not mfasta:
+                        self.logger.info("retrieving batch %d" % ((start / batch_size) + 1))
+                        mfasta = self.efetch(self.dbname, self.query_key, self.webenv)
+                    out.write(mfasta + '\n')
 
 
 LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
```
```diff
--- a/retrieve_fasta_from_NCBI.xml Wed Nov 09 11:27:31 2016 -0500
+++ b/retrieve_fasta_from_NCBI.xml Mon May 15 03:10:11 2017 -0400
@@ -1,4 +1,4 @@
-<tool id="retrieve_fasta_from_NCBI" name="Retrieve FASTA from NCBI" version="0.9.4">
+<tool id="retrieve_fasta_from_NCBI" name="Retrieve FASTA from NCBI" version="1.0.0">
     <description></description>
     <command><![CDATA[
         python '$__tool_directory__'/retrieve_fasta_from_NCBI.py
```
```diff
--- a/test-data/output.fa Wed Nov 09 11:27:31 2016 -0500
+++ b/test-data/output.fa Mon May 15 03:10:11 2017 -0400
@@ -1,4 +1,4 @@
->NC_001834.1_Drosophila_C_virus,_complete_genome
+>NC_001834.1_Drosophila_C_virus_strain_EB,_complete_genome
 TTTATATCGTGTGTACATATAAATATGTACACACGGCTTTTAGGTAGAATATTGTTTTCAATGTTGATTT
 TAAAGGTAACTTTGGTTATTATGCTTTACGGTTTTCATTGTTGATGGTATTTGTGGCCTGCGGTCCCTAA
 TTGTTGAATTATTTATTCTGATACGTTGTTTTCATTGTTGATGGTAAGGATTCTTATTTTGAAGTGGTTT
```