changeset 1:7e41bbb94159 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/fetch_fasta_from_ncbi commit 6008aafac37eec1916d6b72c05d9cfcb002b8095
author artbio
date Sun, 15 Oct 2017 13:56:26 -0400
parents c877548ebde1
children 50f5ef3313bb
files fetch_fasta_from_NCBI.py fetch_fasta_from_NCBI.xml retrieve_fasta_from_NCBI.py retrieve_fasta_from_NCBI.xml
diffstat 4 files changed, 506 insertions(+), 507 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fetch_fasta_from_NCBI.py	Sun Oct 15 13:56:26 2017 -0400
@@ -0,0 +1,411 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+From an Entrez query, retrieves all the matching nucleotide or protein
+sequences and returns them as a multiFASTA file
+
+Entrez Database  UID common name  E-utility Database Name
+Nucleotide       GI number        nuccore
+Protein          GI number        protein
+
+Retrieve strategy:
+
+esearch to get total number of UIDs (count)
+esearch to get UIDs in batches
+loop until the end of the UIDs list:
+  epost to put a batch of UIDs in the history server
+  efetch to retrieve info from previous post
+
+the retmax used for efetch is 1/10 of the value declared by NCBI
+
+queries are delayed by 1 second, to satisfy NCBI guidelines
+(more conservative than what they request)
+"""
+import httplib
+import logging
+import optparse
+import re
+import sys
+import time
+import urllib
+import urllib2
+
+
+class QueryException(Exception):
+    pass
+
+
+class Eutils:
+
+    def __init__(self, options, logger):
+        """
+        Initialize retrieval parameters
+        """
+        self.logger = logger
+        self.base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
+        self.query_string = options.query_string
+        self.dbname = options.dbname
+        if options.outname:
+            self.outname = options.outname
+        else:
+            self.outname = 'NCBI_download' + '.' + self.dbname + '.fasta'
+        self.ids = []
+        self.retmax_esearch = 100000
+        self.retmax_efetch = 500
+        self.count = 0
+        self.webenv = ""
+        self.query_key = ""
+        self.datetype = options.datetype
+        if options.reldate:
+            self.reldate = options.reldate
+        else:
+            self.reldate = ''
+        if options.mindate:
+            self.mindate = options.mindate
+        else:
+            self.mindate = ''
+        if options.maxdate:
+            self.maxdate = options.maxdate
+        else:
+            self.maxdate = ''
+
+    def retrieve(self):
+        """
+        Retrieve the fasta sequences corresponding to the query
+        """
+        self.get_count_value()
+
+        # If no UIDs are found exit script
+        if self.count > 0:
+            self.get_uids_list()
+            try:
+                self.get_sequences()
+            except QueryException as e:
+                self.logger.error("Exiting script.")
+                raise e
+        else:
+            self.logger.error("No UIDs were found. Exiting script.")
+            raise Exception("")
+
+    def get_count_value(self):
+        """
+        Retrieve Count, the total number of UIDs matching the query.
+        Total number of UIDs from the retrieved set to be shown in the XML
+        output (default=20). By default, ESearch only includes the first 20
+        UIDs retrieved in the XML output. If usehistory is set to 'y',
+        the remainder of the retrieved set will be stored on the History server
+
+        http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
+        """
+        self.logger.info("retrieving data from %s" % self.base)
+        self.logger.info("for Query: %s and database: %s" %
+                         (self.query_string, self.dbname))
+        querylog = self.esearch(self.dbname, self.query_string, '', '',
+                                "count", self.datetype, self.reldate,
+                                self.mindate, self.maxdate)
+        self.logger.debug("Query response:")
+        for line in querylog:
+            self.logger.debug(line.rstrip())
+            if '</Count>' in line:
+                self.count = int(line[line.find('<Count>')+len('<Count>'):
+                                 line.find('</Count>')])
+        self.logger.info("Found %d UIDs" % self.count)
+
+    def get_uids_list(self):
+        """
+        Increasing retmax allows more of the retrieved UIDs to be included in
+        the XML output, up to a maximum of 100,000 records.
+        from http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
+        """
+        retmax = self.retmax_esearch
+        if (self.count > retmax):
+            num_batches = (self.count / retmax) + 1
+        else:
+            num_batches = 1
+        self.logger.info("Batch size for esearch action: %d UIDs" % retmax)
+        self.logger.info("Number of batches for esearch action: %d " %
+                         num_batches)
+        for n in range(num_batches):
+            querylog = self.esearch(self.dbname, self.query_string, n*retmax,
+                                    retmax, '', self.datetype, self.reldate,
+                                    self.mindate, self.maxdate)
+            for line in querylog:
+                if '<Id>' in line and '</Id>' in line:
+                    uid = (line[line.find('<Id>')+len('<Id>'):
+                           line.find('</Id>')])
+                    self.ids.append(uid)
+            self.logger.info("Retrieved %d UIDs" % len(self.ids))
+
+    def esearch(self, db, term, retstart, retmax, rettype, datetype, reldate,
+                mindate, maxdate):
+        url = self.base + "esearch.fcgi"
+        self.logger.debug("url: %s" % url)
+        values = {'db': db,
+                  'term': term,
+                  'rettype': rettype,
+                  'retstart': retstart,
+                  'retmax': retmax,
+                  'datetype': datetype,
+                  'reldate': reldate,
+                  'mindate': mindate,
+                  'maxdate': maxdate}
+        data = urllib.urlencode(values)
+        self.logger.debug("data: %s" % str(data))
+        req = urllib2.Request(url, data)
+        response = urllib2.urlopen(req)
+        querylog = response.readlines()
+        response.close()
+        time.sleep(1)
+        return querylog
+
+    def epost(self, db, ids):
+        url = self.base + "epost.fcgi"
+        self.logger.debug("url_epost: %s" % url)
+        values = {'db': db,
+                  'id': ids}
+        data = urllib.urlencode(values)
+        req = urllib2.Request(url, data)
+        serverResponse = False
+        nb_trials = 0
+        while not serverResponse:
+            nb_trials += 1
+            try:
+                self.logger.debug("Try number %s for opening and readin URL %s"
+                                  % (nb_trials, url+data))
+                response = urllib2.urlopen(req)
+                querylog = response.readlines()
+                response.close()
+                serverResponse = True
+            except urllib2.HTTPError as e:
+                self.logger.info("urlopen error:%s, %s" % (e.code, e.read()))
+                self.logger.info("Retrying in 1 sec")
+                serverResponse = False
+                time.sleep(1)
+            except urllib2.URLError as e:
+                self.logger.info("urlopen error: Failed to reach a server")
+                self.logger.info("Reason :%s" % (e.reason))
+                self.logger.info("Retrying in 1 sec")
+                serverResponse = False
+                time.sleep(1)
+            except httplib.IncompleteRead as e:
+                self.logger.info("IncompleteRead error:  %s" % (e.partial))
+                self.logger.info("Retrying in 1 sec")
+                serverResponse = False
+                time.sleep(1)
+        self.logger.debug("query response:")
+        for line in querylog:
+            self.logger.debug(line.rstrip())
+            if '</QueryKey>' in line:
+                self.query_key = str(line[line.find('<QueryKey>') +
+                                     len('<QueryKey>'):line.find('</QueryKey>')
+                                     ])
+            if '</WebEnv>' in line:
+                self.webenv = str(line[line.find('<WebEnv>')+len('<WebEnv>'):
+                                  line.find('</WebEnv>')])
+            self.logger.debug("*** epost action ***")
+            self.logger.debug("query_key: %s" % self.query_key)
+            self.logger.debug("webenv: %s" % self.webenv)
+        time.sleep(1)
+
+    def efetch(self, db, query_key, webenv):
+        url = self.base + "efetch.fcgi"
+        self.logger.debug("url_efetch: %s" % url)
+        values = {'db': db,
+                  'query_key': query_key,
+                  'webenv': webenv,
+                  'rettype': "fasta",
+                  'retmode': "text"}
+        data = urllib.urlencode(values)
+        req = urllib2.Request(url, data)
+        self.logger.debug("data: %s" % str(data))
+        serverTransaction = False
+        counter = 0
+        response_code = 0
+        while not serverTransaction:
+            counter += 1
+            self.logger.info("Server Transaction Trial:  %s" % (counter))
+            try:
+                response = urllib2.urlopen(req)
+                response_code = response.getcode()
+                fasta = response.read()
+                response.close()
+                if((response_code != 200) or
+                   ("Resource temporarily unavailable" in fasta) or
+                   ("Error" in fasta) or (not fasta.startswith(">"))):
+                    serverTransaction = False
+                    if (response_code != 200):
+                        self.logger.info(
+                            "urlopen error: Response code is not 200")
+                    elif ("Resource temporarily unavailable" in fasta):
+                        self.logger.info("Resource temporarily unavailable")
+                    elif ("Error" in fasta):
+                        self.logger.info("Error in fasta")
+                    else:
+                        self.logger.info("Fasta doesn't start with '>'")
+                else:
+                    serverTransaction = True
+            except urllib2.HTTPError as e:
+                serverTransaction = False
+                self.logger.info("urlopen error:%s, %s" % (e.code, e.read()))
+            except urllib2.URLError as e:
+                serverTransaction = False
+                self.logger.info("urlopen error: Failed to reach a server")
+                self.logger.info("Reason :%s" % (e.reason))
+            except httplib.IncompleteRead as e:
+                serverTransaction = False
+                self.logger.info("IncompleteRead error:  %s" % (e.partial))
+            if (counter > 500):
+                serverTransaction = True
+        if (counter > 500):
+            raise QueryException({"message":
+                                  "500 Server Transaction Trials attempted "
+                                  "for this batch. Aborting."})
+        fasta = self.sanitiser(self.dbname, fasta)
+        time.sleep(0.1)
+        return fasta
+
+    def sanitiser(self, db, fastaseq):
+        if db not in ["nuccore", "protein"]:
+            return fastaseq
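+        # matches runs of 49+ amino-acid letters (sequence-like stretches
+        # leaking into headers); applied to protein headers below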
+        regex = re.compile(r"[ACDEFGHIKLMNPQRSTVWYBZ]{49,}")
+        sane_seqlist = []
+        seqlist = fastaseq.split("\n\n")
+        for seq in seqlist[:-1]:
+            fastalines = seq.split("\n")
+            if len(fastalines) < 2:
+                self.logger.info("Empty sequence for %s" %
+                                 ("|".join(fastalines[0].split("|")[:4])))
+                self.logger.info("%s download is skipped" %
+                                 ("|".join(fastalines[0].split("|")[:4])))
+                continue
+            if db == "nuccore":
+                badnuc = 0
+                for nucleotide in fastalines[1]:
+                    if nucleotide not in "ATGC":
+                        badnuc += 1
+                if float(badnuc)/len(fastalines[1]) > 0.4:
+                    self.logger.info(
+                        "%s ambiguous nucleotides in %s or download "
+                        "interrupted at this offset | %s" %
+                        (float(badnuc)/len(fastalines[1]),
+                         "|".join(fastalines[0].split("|")[:4]),
+                         fastalines[1]))
+                    self.logger.info("%s download is skipped" %
+                                     "|".join(fastalines[0].split("|")[:4]))
+                    continue
+                """ remove spaces and trim the header to 100 chars """
+                fastalines[0] = fastalines[0].replace(" ", "_")[:100]
+                cleanseq = "\n".join(fastalines)
+                sane_seqlist.append(cleanseq)
+            elif db == "protein":
+                fastalines[0] = fastalines[0][0:100]
+                fastalines[0] = fastalines[0].replace(" ", "_")
+                fastalines[0] = fastalines[0].replace("[", "_")
+                fastalines[0] = fastalines[0].replace("]", "_")
+                fastalines[0] = fastalines[0].replace("=", "_")
+                """ because blast makedb doesn't like it """
+                fastalines[0] = fastalines[0].rstrip("_")
+                fastalines[0] = re.sub(regex, "_", fastalines[0])
+                cleanseq = "\n".join(fastalines)
+                sane_seqlist.append(cleanseq)
+        self.logger.info("clean sequences appended: %d" % (len(sane_seqlist)))
+        return "\n".join(sane_seqlist)
+
+    def get_sequences(self):
+        """
+        Total number of records from the input set to be retrieved,
+        up to a maximum of 10,000. Optionally, for a large set the value of
+        retstart can be iterated while holding retmax constant, thereby
+        downloading the entire set in batches of size retmax.
+        http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
+        """
+        batch_size = self.retmax_efetch
+        count = self.count
+        uids_list = self.ids
+        self.logger.info("Batch size for efetch action: %d" % batch_size)
+        self.logger.info("Number of batches for efetch action: %d" %
+                         ((count / batch_size) + 1))
+        with open(self.outname, 'w') as out:
+            for start in range(0, count, batch_size):
+                end = min(count, start+batch_size)
+                batch = uids_list[start:end]
+                self.epost(self.dbname, ",".join(batch))
+                mfasta = ''
+                while not mfasta:
+                    self.logger.info("retrieving batch %d" %
+                                     ((start / batch_size) + 1))
+                    try:
+                        mfasta = self.efetch(self.dbname, self.query_key,
+                                             self.webenv)
+                        out.write(mfasta + '\n')
+                    except QueryException as e:
+                        self.logger.error("%s" % e.message)
+                        raise e
+
+
+LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
+LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
+LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
+
+
+def __main__():
+    """ main function """
+    parser = optparse.OptionParser(description='Retrieve data from NCBI')
+    parser.add_option('-i', dest='query_string', help='NCBI Query String')
+    parser.add_option('-o', dest='outname', help='output file name')
+    parser.add_option('-d', dest='dbname', help='database type')
+    parser.add_option('-l', '--logfile', help='log file (default=stderr)')
+    parser.add_option('--datetype', dest='datetype',
+                      choices=['mdat', 'pdat'],
+                      help='Type of date used to limit a search.\
+                            [ mdat(modification date), pdat(publication date)]\
+                            (default=pdat)', default='pdat')
+    parser.add_option('--reldate', dest='reldate',
+                      help='When reldate is set to an integer n, the search\
+                            returns only those items that have a date\
+                            specified by datetype within the last n days.')
+    parser.add_option('--maxdate', dest='maxdate',
+                      help='Date range used to limit a search result by the\
+                            date specified by datetype. These two parameters\
+                            (mindate, maxdate) must be used together to\
+                            specify an arbitrary date range. The general date\
+                            format is YYYY/MM/DD, and these variants are also\
+                            allowed: YYYY, YYYY/MM.')
+    parser.add_option('--mindate', dest='mindate',
+                      help='Date range used to limit a search result by the\
+                            date specified by datetype. These two parameters\
+                            (mindate, maxdate) must be used together to\
+                            specify an arbitrary date range. The general date\
+                            format is YYYY/MM/DD, and these variants are also\
+                            allowed: YYYY, YYYY/MM.')
+    parser.add_option('--loglevel', choices=LOG_LEVELS, default='INFO',
+                      help='logging level (default: INFO)')
+    (options, args) = parser.parse_args()
+    if len(args) > 0:
+        parser.error('Wrong number of arguments')
+    if((options.reldate and options.maxdate) or
+       (options.reldate and options.mindate)):
+        parser.error("You cannot combine 'reldate' with 'mindate' or "
+                     "'maxdate'")
+    if((options.mindate and not options.maxdate) or
+       (options.maxdate and not options.mindate)):
+        parser.error("mindate and maxdate must be used together")
+
+    log_level = getattr(logging, options.loglevel)
+    kwargs = {'format': LOG_FORMAT,
+              'datefmt': LOG_DATEFMT,
+              'level': log_level}
+    if options.logfile:
+        kwargs['filename'] = options.logfile
+    logging.basicConfig(**kwargs)
+    logger = logging.getLogger('data_from_NCBI')
+
+    E = Eutils(options, logger)
+    try:
+        E.retrieve()
+    except Exception:
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fetch_fasta_from_NCBI.xml	Sun Oct 15 13:56:26 2017 -0400
@@ -0,0 +1,95 @@
+<tool id="retrieve_fasta_from_NCBI" name="Retrieve FASTA from NCBI" version="2.1.0">
+  <description></description>
+  <command><![CDATA[
+      python '$__tool_directory__'/fetch_fasta_from_NCBI.py
+      -i "$queryString"
+      -d $dbname
+      -o '$outfilename'
+      -l '$logfile'
+      #if $date_condition.date_filter == "YES":
+          --datetype $date_condition.datetype
+          --mindate $date_condition.mindate
+          --maxdate $date_condition.maxdate
+      #end if
+  ]]></command>
+
+  <inputs>
+    <param name="queryString" type="text" size="5x80" area="True" value="txid10239[orgn] NOT txid131567[orgn] AND complete[all] NOT partial[title] NOT phage[title]" label="Query to NCBI in entrez format" help="exemple:'Drosophila melanogaster[Organism] AND Gcn5[Title]">
+      <sanitizer>
+        <valid initial="string.printable">
+          <remove value="&quot;"/>
+          <remove value="\"/>
+        </valid>
+        <mapping initial="none">
+          <add source="&quot;" target="\&quot;"/>
+          <add source="\" target="\\"/>
+        </mapping>
+      </sanitizer>
+    </param>
+    <param name="dbname" type="select" label="NCBI database">
+      <option value="nuccore">Nucleotide</option>
+      <option value="protein">Protein</option>
+    </param>
+    <conditional name="date_condition">
+        <param name="date_filter" type="boolean" label="Filter the sequences by date?" truevalue="YES" falsevalue="NO" checked="false"/>
+        <when value="YES">
+            <param name="datetype" type="select">
+                <option value="pdat">Publication date</option>
+                <option value="mdat">Modification date</option>
+            </param>
+            <param name="mindate" type="text" help="Date must follow one of the following formats: YYYY/MM/DD, YYYY/MM or YYYY"/>
+            <param name="maxdate" type="text" help="Date must follow one of the following formats: YYYY/MM/DD, YYYY/MM or YYYY"/>
+        </when>
+        <when value="NO"/>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data name="outfilename" format="fasta" label="${tool.name} (${dbname.value_label}) with queryString '${queryString.value}'" />
+    <data format="txt" name="logfile" label="${tool.name}: log"/>
+  </outputs>
+  <tests>
+    <test>
+        <param name="queryString" value="9629650[gi]" />
+        <param name="dbname" value="nuccore" />
+        <output name="outfilename" ftype="fasta" file="output.fa" />
+        <!--  <output name="logfile" ftype="txt" file="log.txt" />  log.txt changes with timestamp. removed to pass the  test -->
+    </test>
+    <test>
+        <param name="queryString" value="CU929326[Accession]" />
+        <param name="dbname" value="nuccore" />
+        <param name="date_filter" value="YES"/>
+        <param name="datetype" value="pdat"/>
+        <param name="mindate" value="2008/09"/>
+        <param name="maxdate" value="2008/09"/>
+        <output name="outfilename" ftype="fasta" file="zebra_output.fa" />
+    </test>
+  </tests>
+  <help>
+**What it does**
+
+This tool retrieves nucleotide or peptide sequences from the corresponding NCBI database for a given Entrez query.
+
+The tool is preset with "txid10239[orgn] NOT txid131567[orgn] AND complete[all] NOT partial[title] NOT phage[title]" for metaVisitor use
+
+See `Entrez help`_ for an explanation of query formats.
+
+Be sure to use the appropriate NCBI query syntax. Always use [] to specify the search fields.
+
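+Example queries (taken from this tool's help and tests)::
+
+    Drosophila melanogaster[Organism] AND Gcn5[Title]
+    9629650[gi]
+    CU929326[Accession]
+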
+Note that the tool may fail if the connection to the NCBI database is interrupted (see the log dataset)
+
+**Acknowledgments**
+
+This Galaxy tool has been adapted from the galaxy tool `get_fasta_from_taxon`_.
+
+It is Copyright © 2014-2015 `CNRS and University Pierre et Marie Curie`_ and is released under the `MIT license`_.
+
+.. _Entrez help: https://www.ncbi.nlm.nih.gov/books/NBK3837/#EntrezHelp.Entrez_Searching_Options
+.. _get_fasta_from_taxon: https://toolshed.g2.bx.psu.edu/view/crs4/get_fasta_from_taxon
+.. _CNRS and University Pierre et Marie Curie: http://www.ibps.upmc.fr/en
+.. _MIT license: http://opensource.org/licenses/MIT
+
+  </help>
+  <citations>
+      <citation type="doi">10.1186/1471-2105-14-73</citation>
+  </citations>
+</tool>
--- a/retrieve_fasta_from_NCBI.py	Thu Aug 24 08:18:34 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,411 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-"""
-From a taxonomy ID retrieves all the nucleotide sequences
-It returns a multiFASTA nuc/prot file
-
-Entrez Database  UID common name  E-utility Database Name
-Nucleotide       GI number        nuccore
-Protein          GI number        protein
-
-Retrieve strategy:
-
-esearch to get total number of UIDs (count)
-esearch to get UIDs in batches
-loop untile end of UIDs list:
-  epost to put a batch of UIDs in the history server
-  efetch to retrieve info from previous post
-
-retmax of efetch is 1/10 of declared value from NCBI
-
-queries are 1 sec delayed, to satisfy NCBI guidelines
-(more than what they request)
-"""
-import sys
-import logging
-import optparse
-import time
-import urllib
-import urllib2
-import httplib
-import re
-
-
-class QueryException(Exception):
-    pass
-
-
-class Eutils:
-
-    def __init__(self, options, logger):
-        """
-        Initialize retrieval parameters
-        """
-        self.logger = logger
-        self.base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
-        self.query_string = options.query_string
-        self.dbname = options.dbname
-        if options.outname:
-            self.outname = options.outname
-        else:
-            self.outname = 'NCBI_download' + '.' + self.dbname + '.fasta'
-        self.ids = []
-        self.retmax_esearch = 100000
-        self.retmax_efetch = 500
-        self.count = 0
-        self.webenv = ""
-        self.query_key = ""
-        self.datetype = options.datetype
-        if options.reldate:
-            self.reldate = options.reldate
-        else:
-            self.reldate = ''
-        if options.mindate:
-            self.mindate = options.mindate
-        else:
-            self.mindate = ''
-        if options.maxdate:
-            self.maxdate = options.maxdate
-        else:
-            self.maxdate = ''
-
-    def retrieve(self):
-        """
-        Retrieve the fasta sequences corresponding to the query
-        """
-        self.get_count_value()
-
-        # If no UIDs are found exit script
-        if self.count > 0:
-            self.get_uids_list()
-            try:
-                self.get_sequences()
-            except QueryException as e:
-                self.logger.error("Exiting script.")
-                raise e
-        else:
-            self.logger.error("No UIDs were found. Exiting script.")
-            raise Exception("")
-
-    def get_count_value(self):
-        """
-        just to retrieve Count (number of UIDs)
-        Total number of UIDs from the retrieved set to be shown in the XML
-        output (default=20). By default, ESearch only includes the first 20
-        UIDs retrieved in the XML output. If usehistory is set to 'y',
-        the remainder of the retrieved set will be stored on the History server
-
-        http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
-        """
-        self.logger.info("retrieving data from %s" % self.base)
-        self.logger.info("for Query: %s and database: %s" %
-                         (self.query_string, self.dbname))
-        querylog = self.esearch(self.dbname, self.query_string, '', '',
-                                "count", self.datetype, self.reldate,
-                                self.mindate, self.maxdate)
-        self.logger.debug("Query response:")
-        for line in querylog:
-            self.logger.debug(line.rstrip())
-            if '</Count>' in line:
-                self.count = int(line[line.find('<Count>')+len('<Count>'):
-                                 line.find('</Count>')])
-        self.logger.info("Found %d UIDs" % self.count)
-
-    def get_uids_list(self):
-        """
-        Increasing retmax allows more of the retrieved UIDs to be included in
-        the XML output, up to a maximum of 100,000 records.
-        from http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
-        """
-        retmax = self.retmax_esearch
-        if (self.count > retmax):
-            num_batches = (self.count / retmax) + 1
-        else:
-            num_batches = 1
-        self.logger.info("Batch size for esearch action: %d UIDs" % retmax)
-        self.logger.info("Number of batches for esearch action: %d " %
-                         num_batches)
-        for n in range(num_batches):
-            querylog = self.esearch(self.dbname, self.query_string, n*retmax,
-                                    retmax, '', self.datetype, self.reldate,
-                                    self.mindate, self.maxdate)
-            for line in querylog:
-                if '<Id>' in line and '</Id>' in line:
-                    uid = (line[line.find('<Id>')+len('<Id>'):
-                           line.find('</Id>')])
-                    self.ids.append(uid)
-            self.logger.info("Retrieved %d UIDs" % len(self.ids))
-
-    def esearch(self, db, term, retstart, retmax, rettype, datetype, reldate,
-                mindate, maxdate):
-        url = self.base + "esearch.fcgi"
-        self.logger.debug("url: %s" % url)
-        values = {'db': db,
-                  'term': term,
-                  'rettype': rettype,
-                  'retstart': retstart,
-                  'retmax': retmax,
-                  'datetype': datetype,
-                  'reldate': reldate,
-                  'mindate': mindate,
-                  'maxdate': maxdate}
-        data = urllib.urlencode(values)
-        self.logger.debug("data: %s" % str(data))
-        req = urllib2.Request(url, data)
-        response = urllib2.urlopen(req)
-        querylog = response.readlines()
-        response.close()
-        time.sleep(1)
-        return querylog
-
-    def epost(self, db, ids):
-        url = self.base + "epost.fcgi"
-        self.logger.debug("url_epost: %s" % url)
-        values = {'db': db,
-                  'id': ids}
-        data = urllib.urlencode(values)
-        req = urllib2.Request(url, data)
-        serverResponse = False
-        nb_trials = 0
-        while not serverResponse:
-            nb_trials += 1
-            try:
-                self.logger.debug("Try number %s for opening and readin URL %s"
-                                  % (nb_trials, url+data))
-                response = urllib2.urlopen(req)
-                querylog = response.readlines()
-                response.close()
-                serverResponse = True
-            except urllib2.HTTPError as e:
-                self.logger.info("urlopen error:%s, %s" % (e.code, e.read()))
-                self.logger.info("Retrying in 1 sec")
-                serverResponse = False
-                time.sleep(1)
-            except urllib2.URLError as e:
-                self.logger.info("urlopen error: Failed to reach a server")
-                self.logger.info("Reason :%s" % (e.reason))
-                self.logger.info("Retrying in 1 sec")
-                serverResponse = False
-                time.sleep(1)
-            except httplib.IncompleteRead as e:
-                self.logger.info("IncompleteRead error:  %s" % (e.partial))
-                self.logger.info("Retrying in 1 sec")
-                serverResponse = False
-                time.sleep(1)
-        self.logger.debug("query response:")
-        for line in querylog:
-            self.logger.debug(line.rstrip())
-            if '</QueryKey>' in line:
-                self.query_key = str(line[line.find('<QueryKey>') +
-                                     len('<QueryKey>'):line.find('</QueryKey>')
-                                     ])
-            if '</WebEnv>' in line:
-                self.webenv = str(line[line.find('<WebEnv>')+len('<WebEnv>'):
-                                  line.find('</WebEnv>')])
-            self.logger.debug("*** epost action ***")
-            self.logger.debug("query_key: %s" % self.query_key)
-            self.logger.debug("webenv: %s" % self.webenv)
-        time.sleep(1)
-
-    def efetch(self, db, query_key, webenv):
-        url = self.base + "efetch.fcgi"
-        self.logger.debug("url_efetch: %s" % url)
-        values = {'db': db,
-                  'query_key': query_key,
-                  'webenv': webenv,
-                  'rettype': "fasta",
-                  'retmode': "text"}
-        data = urllib.urlencode(values)
-        req = urllib2.Request(url, data)
-        self.logger.debug("data: %s" % str(data))
-        serverTransaction = False
-        counter = 0
-        response_code = 0
-        while not serverTransaction:
-            counter += 1
-            self.logger.info("Server Transaction Trial:  %s" % (counter))
-            try:
-                response = urllib2.urlopen(req)
-                response_code = response.getcode()
-                fasta = response.read()
-                response.close()
-                if((response_code != 200) or
-                   ("Resource temporarily unavailable" in fasta) or
-                   ("Error" in fasta) or (not fasta.startswith(">"))):
-                    serverTransaction = False
-                    if (response_code != 200):
-                        self.logger.info("urlopen error: Response code is not\
-                                         200")
-                    elif ("Resource temporarily unavailable" in fasta):
-                        self.logger.info("Ressource temporarily unavailable")
-                    elif ("Error" in fasta):
-                        self.logger.info("Error in fasta")
-                    else:
-                        self.logger.info("Fasta doesn't start with '>'")
-                else:
-                    serverTransaction = True
-            except urllib2.HTTPError as e:
-                serverTransaction = False
-                self.logger.info("urlopen error:%s, %s" % (e.code, e.read()))
-            except urllib2.URLError as e:
-                serverTransaction = False
-                self.logger.info("urlopen error: Failed to reach a server")
-                self.logger.info("Reason :%s" % (e.reason))
-            except httplib.IncompleteRead as e:
-                serverTransaction = False
-                self.logger.info("IncompleteRead error:  %s" % (e.partial))
-            if (counter > 500):
-                serverTransaction = True
-        if (counter > 500):
-            raise QueryException({"message":
-                                  "500 Server Transaction Trials attempted for\
-                                  this batch. Aborting."})
-        fasta = self.sanitiser(self.dbname, fasta)
-        time.sleep(0.1)
-        return fasta
-
-    def sanitiser(self, db, fastaseq):
-        if(db not in "nuccore protein"):
-            return fastaseq
-        regex = re.compile(r"[ACDEFGHIKLMNPQRSTVWYBZ]{49,}")
-        sane_seqlist = []
-        seqlist = fastaseq.split("\n\n")
-        for seq in seqlist[:-1]:
-            fastalines = seq.split("\n")
-            if len(fastalines) < 2:
-                self.logger.info("Empty sequence for %s" %
-                                 ("|".join(fastalines[0].split("|")[:4])))
-                self.logger.info("%s download is skipped" %
-                                 ("|".join(fastalines[0].split("|")[:4])))
-                continue
-            if db == "nuccore":
-                badnuc = 0
-                for nucleotide in fastalines[1]:
-                    if nucleotide not in "ATGC":
-                        badnuc += 1
-                if float(badnuc)/len(fastalines[1]) > 0.4:
-                    self.logger.info("%s ambiguous nucleotides in %s\
-                                     or download interrupted at this offset\
-                                     | %s" % (float(badnuc)/len(fastalines[1]),
-                                              "|".join(fastalines[0].split("|")
-                                                       [:4]),
-                                              fastalines[1]))
-                    self.logger.info("%s download is skipped" %
-                                     (fastalines[0].split("|")[:4]))
-                    continue
-                """ remove spaces and trim the header to 100 chars """
-                fastalines[0] = fastalines[0].replace(" ", "_")[:100]
-                cleanseq = "\n".join(fastalines)
-                sane_seqlist.append(cleanseq)
-            elif db == "protein":
-                fastalines[0] = fastalines[0][0:100]
-                fastalines[0] = fastalines[0].replace(" ", "_")
-                fastalines[0] = fastalines[0].replace("[", "_")
-                fastalines[0] = fastalines[0].replace("]", "_")
-                fastalines[0] = fastalines[0].replace("=", "_")
-                """ because blast makedb doesn't like it """
-                fastalines[0] = fastalines[0].rstrip("_")
-                fastalines[0] = re.sub(regex, "_", fastalines[0])
-                cleanseq = "\n".join(fastalines)
-                sane_seqlist.append(cleanseq)
-        self.logger.info("clean sequences appended: %d" % (len(sane_seqlist)))
-        return "\n".join(sane_seqlist)
-
-    def get_sequences(self):
-        """
-        Total number of records from the input set to be retrieved,
-        up to a maximum of 10,000. Optionally, for a large set the value of
-        retstart can be iterated while holding retmax constant, thereby
-        downloading the entire set in batches of size retmax.
-        http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
-        """
-        batch_size = self.retmax_efetch
-        count = self.count
-        uids_list = self.ids
-        self.logger.info("Batch size for efetch action: %d" % batch_size)
-        self.logger.info("Number of batches for efetch action: %d" %
-                         ((count / batch_size) + 1))
-        with open(self.outname, 'w') as out:
-            for start in range(0, count, batch_size):
-                end = min(count, start+batch_size)
-                batch = uids_list[start:end]
-                if self.epost(self.dbname, ",".join(batch)) != -1:
-                    mfasta = ''
-                    while not mfasta:
-                        self.logger.info("retrieving batch %d" %
-                                         ((start / batch_size) + 1))
-                        try:
-                            mfasta = self.efetch(self.dbname, self.query_key,
-                                                 self.webenv)
-                            out.write(mfasta + '\n')
-                        except QueryException as e:
-                            self.logger.error("%s" % e.message)
-                            raise e
-
-
-LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
-LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
-LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
-
-
-def __main__():
-    """ main function """
-    parser = optparse.OptionParser(description='Retrieve data from NCBI')
-    parser.add_option('-i', dest='query_string', help='NCBI Query String')
-    parser.add_option('-o', dest='outname', help='output file name')
-    parser.add_option('-d', dest='dbname', help='database type')
-    parser.add_option('-l', '--logfile', help='log file (default=stderr)')
-    parser.add_option('--datetype', dest='datetype',
-                      choices=['mdat', 'edat', 'pdat'],
-                      help='Type of date used to limit a search.\
-                            [ mdat(modification date), pdat(publication date),\
-                            edat(entrez date)] (default=pdat)', default='pdat')
-    parser.add_option('--reldate', dest='reldate',
-                      help='When reldate is set to an integer n, the search\
-                            returns only those items that have a date\
-                            specified by datetype within the last n days.')
-    parser.add_option('--maxdate', dest='maxdate',
-                      help='Date range used to limit a search result by the\
-                            date specified by datetype. These two parameters\
-                            (mindate, maxdate) must be used together to\
-                            specify an arbitrary date range. The general date\
-                            format is YYYY/MM/DD, and these variants are also\
-                            allowed: YYYY, YYYY/MM.')
-    parser.add_option('--mindate', dest='mindate',
-                      help='Date range used to limit a search result by the\
-                            date specified by datetype. These two parameters\
-                            (mindate, maxdate) must be used together to\
-                            specify an arbitrary date range. The general date\
-                            format is YYYY/MM/DD, and these variants are also\
-                            allowed: YYYY, YYYY/MM.')
-    parser.add_option('--loglevel', choices=LOG_LEVELS, default='INFO',
-                      help='logging level (default: INFO)')
-    (options, args) = parser.parse_args()
-    if len(args) > 0:
-        parser.error('Wrong number of arguments')
-    if((options.reldate and options.maxdate) or
-       (options.reldate and options.mindate)):
-        parser.error("You can't mix 'reldate' and 'maxdate', 'mindate'\
-                     parameters")
-    if((options.mindate and not options.maxdate) or
-       (options.maxdate and not options.mindate)):
-        parser.error("mindate and maxdate must be used together")
-
-    log_level = getattr(logging, options.loglevel)
-    kwargs = {'format': LOG_FORMAT,
-              'datefmt': LOG_DATEFMT,
-              'level': log_level}
-    if options.logfile:
-        kwargs['filename'] = options.logfile
-    logging.basicConfig(**kwargs)
-    logger = logging.getLogger('data_from_NCBI')
-
-    E = Eutils(options, logger)
-    try:
-        E.retrieve()
-    except Exception as e:
-        sys.exit(1)
-
-
-if __name__ == "__main__":
-    __main__()
--- a/retrieve_fasta_from_NCBI.xml	Thu Aug 24 08:18:34 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,96 +0,0 @@
-<tool id="retrieve_fasta_from_NCBI" name="Retrieve FASTA from NCBI" version="2.0.0">
-  <description></description>
-  <command><![CDATA[
-      python '$__tool_directory__'/retrieve_fasta_from_NCBI.py
-      -i "$queryString"
-      -d $dbname
-      -o '$outfilename'
-      -l '$logfile'
-      #if $date_condition.date_filter == "YES":
-          --datetype $date_condition.datetype
-          --mindate $date_condition.mindate
-          --maxdate $date_condition.maxdate
-      #end if
-  ]]></command>
-
-  <inputs>
-    <param name="queryString" type="text" size="5x80" area="True" value="txid10239[orgn] NOT txid131567[orgn] AND complete[all] NOT partial[title] NOT phage[title]" label="Query to NCBI in entrez format" help="exemple:'Drosophila melanogaster[Organism] AND Gcn5[Title]">
-      <sanitizer>
-        <valid initial="string.printable">
-          <remove value="&quot;"/>
-          <remove value="\"/>
-        </valid>
-        <mapping initial="none">
-          <add source="&quot;" target="\&quot;"/>
-          <add source="\" target="\\"/>
-        </mapping>
-      </sanitizer>
-    </param>
-    <param name="dbname" type="select" label="NCBI database">
-      <option value="nuccore">Nucleotide</option>
-      <option value="protein">Protein</option>
-    </param>
-    <conditional name="date_condition">
-        <param name="date_filter" type="boolean" label="Filter the sequences by date?" truevalue="YES" falsevalue="NO" checked="false"/>
-        <when value="YES">
-            <param name="datetype" type="select">
-                <option value="pdat">Publication date</option>
-                <option value="edat">Entrez date</option>
-                <option value="mdat">Modification date</option>
-            </param>
-            <param name="mindate" type="text" help="Date must follow one of the following formats: YYYY/MM/DD, YYYY/MM or YYYY"/>
-            <param name="maxdate" type="text" help="Date must follow one of the following formats: YYYY/MM/DD, YYYY/MM or YYYY"/>
-        </when>
-        <when value="NO"/>
-    </conditional>
-  </inputs>
-  <outputs>
-    <data name="outfilename" format="fasta" label="${tool.name} (${dbname.value_label}) with queryString '${queryString.value}'" />
-    <data format="txt" name="logfile" label="${tool.name}: log"/>
-  </outputs>
-  <tests>
-    <test>
-        <param name="queryString" value="9629650[gi]" />
-        <param name="dbname" value="nuccore" />
-        <output name="outfilename" ftype="fasta" file="output.fa" />
-        <!--  <output name="logfile" ftype="txt" file="log.txt" />  log.txt changes with timestamp. removed to pass the  test -->
-    </test>
-    <test>
-        <param name="queryString" value="CU929326[Accession]" />
-        <param name="dbname" value="nuccore" />
-        <param name="date_filter" value="YES"/>
-        <param name="datetype" value="pdat"/>
-        <param name="mindate" value="2008/09"/>
-        <param name="maxdate" value="2008/09"/>
-        <output name="outfilename" ftype="fasta" file="zebra_output.fa" />
-    </test>
-  </tests>
-  <help>
-**What it does**
-
-This tool retrieves nucleotide/peptide sequences from the corresponding NCBI database for a given entrez query.
-
-The tool is preset with "txid10239[orgn] NOT txid131567[orgn] AND complete NOT partial[title] NOT phage[title]" for metaVisitor use purpose
-
-See `Entrez help`_ for explanation of query formats
-
-Be sure to use the appropriate NCBI query syntax. Always use [] to specify the search fields.
-
-Note that the tool may fail in case of interrupted connexion with the NCBI database (see the log dataset)
-
-**Acknowledgments**
-
-This Galaxy tool has been adapted from the galaxy tool `get_fasta_from_taxon`_.
-
-It is Copyright © 2014-2015 `CNRS and University Pierre et Marie Curie`_ and is released under the `MIT license`_.
-
-.. _Entrez help: https://www.ncbi.nlm.nih.gov/books/NBK3837/#EntrezHelp.Entrez_Searching_Options
-.. _get_fasta_from_taxon: https://toolshed.g2.bx.psu.edu/view/crs4/get_fasta_from_taxon
-.. _CNRS and University Pierre et Marie Curie: http://www.ibps.upmc.fr/en
-.. _MIT license: http://opensource.org/licenses/MIT
-
-  </help>
-  <citations>
-      <citation type="doi">10.1186/1471-2105-14-73</citation>
-  </citations>
-</tool>