Mercurial > repos > crs4 > get_fasta_from_taxon

# -*- coding: utf-8 -*-
"""
From a taxonomy ID retrieves all the nucleotide sequences
It returns a multiFASTA nuc/prot file

Entrez Database  UID common name  E-utility Database Name
Nucleotide       GI number        nuccore
Protein          GI number        protein

Retrieve strategy:

esearch to get total number of UIDs (count)
esearch to get UIDs in batches
loop untile end of UIDs list:
  epost to put a batch of UIDs in the history server
  efetch to retrieve info from previous post

retmax of efetch is 1/10 of declared value from NCBI

queries are 1 sec delayed, to satisfy NCBI guidelines (more than what they request)


python get_fasta_from_taxon.py -i 1638 -o test.out -d protein
python get_fasta_from_taxon.py -i 327045 -o test.out -d nuccore # 556468 UIDs
"""

import logging
import optparse
import time
import urllib
import urllib2

class Eutils:

    def __init__(self, options, logger):
        self.logger = logger
        self.base = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        self.taxid = options.taxid
        self.dbname = options.dbname
        if options.outname:
            self.outname = options.outname
        else:
            self.outname = 'taxid' + self.taxid + '.' + self.dbname + '.fasta'
        self.ids = []
        self.retmax_esearch = 100000
        self.retmax_efetch = 1000
        self.count = 0
        self.webenv = ""
        self.query_key = ""

    def retrieve(self):
        """ """
        self.get_count_value()
        self.get_uids_list()
        self.get_sequences()

    def get_count_value(self):
        """
        just to retrieve Count (number of UIDs)
        Total number of UIDs from the retrieved set to be shown in the XML
        output (default=20). By default, ESearch only includes the first 20
        UIDs retrieved in the XML output. If usehistory is set to 'y',
        the remainder of the retrieved set will be stored on the History server;

        http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
        """
        self.logger.info("retrieving data from %s" % self.base)
        self.logger.info("for TaxID: %s and database: %s" %
                         (self.taxid, self.dbname))
        querylog = self.esearch(self.dbname, "txid%s[Organism:exp]" % self.taxid, '', '', "count")
        self.logger.debug("taxonomy response:")
        for line in querylog:
            self.logger.debug(line.rstrip())
            if '</Count>' in line:
                self.count = int(line[line.find('<Count>')+len('<Count>') : line.find('</Count>')])
        self.logger.info("Founded %d UIDs" % self.count)

    def get_uids_list(self):
        """
        Increasing retmax allows more of the retrieved UIDs to be included in the XML output,
        up to a maximum of 100,000 records.
        from http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
        """
        retmax = self.retmax_esearch
        if (self.count > retmax):
            num_batches = (self.count / retmax) + 1
        else:
            num_batches = 1
        self.logger.info("Batch size for esearch action: %d UIDs" % retmax)
        self.logger.info("Number of batches for esearch action: %d " % num_batches)
        for n in range(num_batches):
            querylog = self.esearch(self.dbname, "txid%s[Organism:exp]" % self.taxid, n*retmax, retmax, '')
            for line in querylog:
                if '<Id>' in line and '</Id>' in line:
                    uid = (line[line.find('<Id>')+len('<Id>') : line.find('</Id>')])
                    self.ids.append(uid)
            self.logger.info("Retrieved %d UIDs" % len(self.ids))

    def esearch(self, db, term, retstart, retmax, rettype):
        url = self.base + "esearch.fcgi"
        self.logger.debug("url: %s" % url)
        values = {'db': db,
                  'term': term,
                  'rettype': rettype,
                  'retstart': retstart,
                  'retmax': retmax}
        data = urllib.urlencode(values)
        self.logger.debug("data: %s" % str(data))
        req = urllib2.Request(url, data)
        response = urllib2.urlopen(req)
        querylog = response.readlines()
        time.sleep(1)
        return querylog

    def epost(self, db, ids):
        url = self.base + "epost.fcgi"
        self.logger.debug("url_epost: %s" % url)
        values = {'db': db,
                  'id': ids}
        data = urllib.urlencode(values)
        req = urllib2.Request(url, data)
        #self.logger.debug("data: %s" % str(data))
        req = urllib2.Request(url, data)
        response = urllib2.urlopen(req)
        querylog = response.readlines()
        self.logger.debug("taxonomy response:")
        for line in querylog:
            self.logger.debug(line.rstrip())
            if '</QueryKey>' in line:
                self.query_key = str(line[line.find('<QueryKey>')+len('<QueryKey>'):line.find('</QueryKey>')])
            if '</WebEnv>' in line:
                self.webenv = str(line[line.find('<WebEnv>')+len('<WebEnv>'):line.find('</WebEnv>')])
            self.logger.debug("*** epost action ***")
            self.logger.debug("query_key: %s" % self.query_key)
            self.logger.debug("webenv: %s" % self.webenv)
        time.sleep(1)

    def efetch(self, db, query_key, webenv):
        url = self.base + "efetch.fcgi"
        self.logger.debug("url_efetch: %s" % url)
        values = {'db': db,
                  'query_key': query_key,
                  'webenv': webenv,
                  'rettype': "fasta",
                  'retmode': "text"}
        data = urllib.urlencode(values)
        req = urllib2.Request(url, data)
        self.logger.debug("data: %s" % str(data))
        req = urllib2.Request(url, data)
        response = urllib2.urlopen(req)
        fasta = response.read()
        assert fasta.startswith(">"), fasta
        time.sleep(1)
        return fasta

    def get_sequences(self):
        """
        Total number of records from the input set to be retrieved, up to a maximum
        of 10,000. Optionally, for a large set the value of retstart can be iterated
        while holding retmax constant, thereby downloading the entire set in batches
        of size retmax.

        http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch

        """
        batch_size = self.retmax_efetch
        count = self.count
        uids_list = self.ids
        self.logger.info("Batch size for efetch action: %d" % batch_size)
        self.logger.info("Number of batches for efetch action: %d" % ((count / batch_size) + 1))
        with open(self.outname, 'w') as out:
            for start in range(0, count, batch_size):
                end = min(count, start+batch_size)
                batch = uids_list[start:end]
                self.epost(self.dbname, ",".join(batch))
                self.logger.info("retrieving batch %d" % ((start / batch_size) + 1))
                mfasta = self.efetch(self.dbname, self.query_key, self.webenv)
                out.write(mfasta + '\n')


LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']


def __main__():
    """ main function """
    parser = optparse.OptionParser(description='Retrieve data from NCBI')
    parser.add_option('-i', dest='taxid', help='taxonomy ID mandatory')
    parser.add_option('-o', dest='outname', help='output file name')
    parser.add_option('-l', '--logfile', help='log file (default=stderr)')
    parser.add_option('--loglevel', choices=LOG_LEVELS, default='INFO', help='logging level (default: INFO)')
    parser.add_option('-d', dest='dbname', help='database type')
    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error('Wrong number of arguments')

    log_level = getattr(logging, options.loglevel)
    kwargs = {'format': LOG_FORMAT,
              'datefmt': LOG_DATEFMT,
              'level': log_level}
    if options.logfile:
        kwargs['filename'] = options.logfile
    logging.basicConfig(**kwargs)
    logger = logging.getLogger('data_from_NCBI')

    E = Eutils(options, logger)
    E.retrieve()


if __name__ == "__main__":
    __main__()
author	crs4
date	Wed, 11 Sep 2013 06:07:50 -0400
parents
children