Repository 'fetch_fasta_from_ncbi'
hg clone https://toolshed.g2.bx.psu.edu/repos/drosofff/fetch_fasta_from_ncbi

Changeset 0:0bdc5a73c8d1 (2015-06-21)
Next changeset 1:79cb7620843d (2015-10-28)
Commit message:
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
added:
retrieve_fasta_from_NCBI.py
retrieve_fasta_from_NCBI.xml
test-data/output.fa
b
diff -r 000000000000 -r 0bdc5a73c8d1 retrieve_fasta_from_NCBI.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/retrieve_fasta_from_NCBI.py Sun Jun 21 14:29:45 2015 -0400
[
b'@@ -0,0 +1,279 @@\n+#!/usr/bin/env python\n+# -*- coding: utf-8 -*-\n+"""\n+From a taxonomy ID retrieves all the nucleotide sequences\n+It returns a multiFASTA nuc/prot file\n+\n+Entrez Database  UID common name  E-utility Database Name\n+Nucleotide       GI number        nuccore\n+Protein          GI number        protein\n+\n+Retrieve strategy:\n+\n+esearch to get total number of UIDs (count)\n+esearch to get UIDs in batches\n+loop untile end of UIDs list:\n+  epost to put a batch of UIDs in the history server\n+  efetch to retrieve info from previous post\n+\n+retmax of efetch is 1/10 of declared value from NCBI\n+\n+queries are 1 sec delayed, to satisfy NCBI guidelines (more than what they request)\n+\n+\n+python get_fasta_from_taxon.py -i 1638 -o test.out -d protein\n+python get_fasta_from_taxon.py -i 327045 -o test.out -d nuccore # 556468 UIDs\n+"""\n+import sys\n+import logging\n+import optparse\n+import time\n+import urllib\n+import urllib2\n+import httplib\n+import re\n+class Eutils:\n+\n+    def __init__(self, options, logger):\n+        self.logger = logger\n+        self.base = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/"\n+        self.query_string = options.query_string\n+        self.dbname = options.dbname\n+        if options.outname:\n+            self.outname = options.outname\n+        else:\n+            self.outname = \'NCBI_download\' + \'.\' + self.dbname + \'.fasta\'\n+        self.ids = []\n+        self.retmax_esearch = 100000\n+        self.retmax_efetch = 1000\n+        self.count = 0\n+        self.webenv = ""\n+        self.query_key = ""\n+\n+    def retrieve(self):\n+        """ """\n+        self.get_count_value()\n+        self.get_uids_list()\n+        self.get_sequences()\n+\n+    def get_count_value(self):\n+        """\n+        just to retrieve Count (number of UIDs)\n+        Total number of UIDs from the retrieved set to be shown in the XML\n+        output (default=20). 
By default, ESearch only includes the first 20\n+        UIDs retrieved in the XML output. If usehistory is set to \'y\',\n+        the remainder of the retrieved set will be stored on the History server;\n+\n+        http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch\n+        """\n+        self.logger.info("retrieving data from %s" % self.base)\n+        self.logger.info("for Query: %s and database: %s" %\n+                         (self.query_string, self.dbname))\n+        querylog = self.esearch(self.dbname, self.query_string, \'\', \'\', "count")\n+        self.logger.debug("Query response:")\n+        for line in querylog:\n+            self.logger.debug(line.rstrip())\n+            if \'</Count>\' in line:\n+                self.count = int(line[line.find(\'<Count>\')+len(\'<Count>\') : line.find(\'</Count>\')])\n+        self.logger.info("Founded %d UIDs" % self.count)\n+\n+    def get_uids_list(self):\n+        """\n+        Increasing retmax allows more of the retrieved UIDs to be included in the XML output,\n+        up to a maximum of 100,000 records.\n+        from http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch\n+        """\n+        retmax = self.retmax_esearch\n+        if (self.count > retmax):\n+            num_batches = (self.count / retmax) + 1\n+        else:\n+            num_batches = 1\n+        self.logger.info("Batch size for esearch action: %d UIDs" % retmax)\n+        self.logger.info("Number of batches for esearch action: %d " % num_batches)\n+        for n in range(num_batches):\n+            querylog = self.esearch(self.dbname, self.query_string, n*retmax, retmax, \'\')\n+            for line in querylog:\n+                if \'<Id>\' in line and \'</Id>\' in line:\n+                    uid = (line[line.find(\'<Id>\')+len(\'<Id>\') : line.find(\'</Id>\')])\n+                    self.ids.append(uid)\n+            self.logger.info("Retrieved %d UIDs" % len(self.ids))\n+\n+    def esearch(self, db, term, retstart, 
retmax, rettype):\n+        url = self.base + "esearch.fcgi"\n+        self.logger.debug("url: %s" % url)\n+        values = {\'db\': db,\n+                  \'term\': term,\n+                '..b'if db == "nuccore":\n+                badnuc = 0\n+                for nucleotide in fastalines[1]:\n+                    if nucleotide not in "ATGC":\n+                        badnuc += 1\n+                if float(badnuc)/len(fastalines[1]) > 0.4:\n+                    self.logger.info("%s ambiguous nucleotides in %s or download interrupted at this offset | %s" % ( float(badnuc)/len(fastalines[1]), "|".join(fastalines[0].split("|")[:4]), fastalines[1]) )\n+                    self.logger.info("%s download is skipped" % (fastalines[0].split("|")[:4]) )\n+                    continue\n+                fastalines[0] = fastalines[0].replace(" ","_")[:100] # remove spaces and trim the header to 100 chars\n+                cleanseq = "\\n".join(fastalines)\n+                sane_seqlist.append(cleanseq)\n+            elif db == "protein":\n+                fastalines[0] = fastalines[0][0:100]\n+                fastalines[0] = fastalines[0].replace(" ", "_")\n+                fastalines[0] = fastalines[0].replace("[", "_")\n+                fastalines[0] = fastalines[0].replace("]", "_")\n+                fastalines[0] = fastalines[0].replace("=", "_")\n+                fastalines[0] = fastalines[0].rstrip("_") # because blast makedb doesn\'t like it \n+                fastalines[0] = re.sub(regex, "_", fastalines[0])\n+                cleanseq = "\\n".join(fastalines)\n+                sane_seqlist.append(cleanseq)\n+        self.logger.info("clean sequences appended: %d" % (len(sane_seqlist) ) )\n+        return "\\n".join(sane_seqlist)\n+\n+    def get_sequences(self):\n+        """\n+        Total number of records from the input set to be retrieved, up to a maximum\n+        of 10,000. 
Optionally, for a large set the value of retstart can be iterated\n+        while holding retmax constant, thereby downloading the entire set in batches\n+        of size retmax.\n+        \n+        http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch\n+        \n+        """\n+        batch_size = self.retmax_efetch\n+        count = self.count\n+        uids_list = self.ids\n+        self.logger.info("Batch size for efetch action: %d" % batch_size)\n+        self.logger.info("Number of batches for efetch action: %d" % ((count / batch_size) + 1))\n+        with open(self.outname, \'w\') as out:\n+            for start in range(0, count, batch_size):\n+                end = min(count, start+batch_size)\n+                batch = uids_list[start:end]\n+                self.epost(self.dbname, ",".join(batch))\n+                mfasta = \'\'\n+                while not mfasta:\n+                    self.logger.info("retrieving batch %d" % ((start / batch_size) + 1))\n+                    mfasta = self.efetch(self.dbname, self.query_key, self.webenv)\n+                out.write(mfasta + \'\\n\')\n+\n+\n+LOG_FORMAT = \'%(asctime)s|%(levelname)-8s|%(message)s\'\n+LOG_DATEFMT = \'%Y-%m-%d %H:%M:%S\'\n+LOG_LEVELS = [\'DEBUG\', \'INFO\', \'WARNING\', \'ERROR\', \'CRITICAL\']\n+\n+\n+def __main__():\n+    """ main function """\n+    parser = optparse.OptionParser(description=\'Retrieve data from NCBI\')\n+    parser.add_option(\'-i\', dest=\'query_string\', help=\'NCBI Query String\')\n+    parser.add_option(\'-o\', dest=\'outname\', help=\'output file name\')\n+    parser.add_option(\'-l\', \'--logfile\', help=\'log file (default=stderr)\')\n+    parser.add_option(\'--loglevel\', choices=LOG_LEVELS, default=\'INFO\', help=\'logging level (default: INFO)\')\n+    parser.add_option(\'-d\', dest=\'dbname\', help=\'database type\')\n+    (options, args) = parser.parse_args()\n+    if len(args) > 0:\n+        parser.error(\'Wrong number of arguments\')\n+    \n+    
log_level = getattr(logging, options.loglevel)\n+    kwargs = {\'format\': LOG_FORMAT,\n+              \'datefmt\': LOG_DATEFMT,\n+              \'level\': log_level}\n+    if options.logfile:\n+        kwargs[\'filename\'] = options.logfile\n+    logging.basicConfig(**kwargs)\n+    logger = logging.getLogger(\'data_from_NCBI\')\n+    \n+    E = Eutils(options, logger)\n+    E.retrieve()\n+\n+\n+if __name__ == "__main__":\n+    __main__()\n'
b
diff -r 000000000000 -r 0bdc5a73c8d1 retrieve_fasta_from_NCBI.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/retrieve_fasta_from_NCBI.xml Sun Jun 21 14:29:45 2015 -0400
[
@@ -0,0 +1,64 @@
+<tool id="retrieve_fasta_from_NCBI" name="Retrieve FASTA from NCBI" version="0.9.2">
+  <description></description>
+  <command interpreter="python">retrieve_fasta_from_NCBI.py -i "$queryString" -d $dbname -o $outfilename -l $logfile </command>
+
+  <inputs>
+    <param name="queryString" type="text" size="5x80" area="True" value="txid10239[orgn] NOT txid131567[orgn] AND complete[all] NOT partial[title] NOT phage[title]" label="Query to NCBI in entrez format" help="example: 'Drosophila melanogaster[Organism] AND Gcn5[Title]'">
+      <sanitizer>
+        <valid initial="string.printable">
+          <remove value="&quot;"/>
+          <remove value="\"/>
+        </valid>
+        <mapping initial="none">
+          <add source="&quot;" target="\&quot;"/>
+          <add source="\" target="\\"/>
+        </mapping>
+      </sanitizer>
+    </param>
+    <param name="dbname" type="select" label="NCBI database">
+      <option value="nuccore">Nucleotide</option>
+      <option value="protein">Protein</option>
+<!--      <option value="pubmed">Pubmed (experimental)</option> -->
+    </param>
+  </inputs>
+  <outputs>
+    <data name="outfilename" format="fasta" label="${tool.name} on ${on_string}: queryString${queryString.value}.${dbname.value_label}.fasta" />
+    <data format="txt" name="logfile" label="${tool.name} on ${on_string}: log"/>
+  </outputs>
+  <tests>
+    <test>
+        <param name="queryString" value="9629650[gi]" />
+        <param name="dbname" value="nuccore" />
+        <output name="outfilename" ftype="fasta" file="output.fa" />
+        <!--  <output name="logfile" ftype="txt" file="log.txt" />  log.txt changes with timestamp. removed to pass the  test -->
+    </test>
+  </tests>
+  <help>
+**What it does**
+
+This tool retrieves nucleotide/peptide sequences from the corresponding NCBI database for a given entrez query.
+
+The tool is preset with "txid10239[orgn] NOT txid131567[orgn] AND complete[all] NOT partial[title] NOT phage[title]" for use with metaVisitor.
+
+See `Entrez help`_ for an explanation of query formats.
+
+Be sure to use the appropriate NCBI query syntax. Always use [] to specify the search fields.
+
+Note that the tool may fail if the connection to the NCBI database is interrupted (see the log dataset).
+
+**Acknowledgments**
+
+This Galaxy tool has been adapted from the galaxy tool `get_fasta_from_taxon`_.
+
+It is Copyright © 2014-2015 `CNRS and University Pierre et Marie Curie`_ and is released under the `MIT license`_.
+
+.. _Entrez help: http://www.ncbi.nlm.nih.gov/books/NBK3837/#EntrezHelp.Entrez_Searching_Options
+.. _get_fasta_from_taxon: https://toolshed.g2.bx.psu.edu/view/crs4/get_fasta_from_taxon
+.. _CNRS and University Pierre et Marie Curie: http://www.ibps.upmc.fr/en
+.. _MIT license: http://opensource.org/licenses/MIT
+
+  </help>
+  <citations>
+      <citation type="doi">10.1186/1471-2105-14-73</citation>
+  </citations>
+</tool>
b
diff -r 000000000000 -r 0bdc5a73c8d1 test-data/output.fa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output.fa Sun Jun 21 14:29:45 2015 -0400
b
b'@@ -0,0 +1,134 @@\n+>gi|9629650|ref|NC_001834.1|_Drosophila_C_virus,_complete_genome\n+TTTATATCGTGTGTACATATAAATATGTACACACGGCTTTTAGGTAGAATATTGTTTTCAATGTTGATTT\n+TAAAGGTAACTTTGGTTATTATGCTTTACGGTTTTCATTGTTGATGGTATTTGTGGCCTGCGGTCCCTAA\n+TTGTTGAATTATTTATTCTGATACGTTGTTTTCATTGTTGATGGTAAGGATTCTTATTTTGAAGTGGTTT\n+TTCAGAAGATAACTCTAAATATGAATTATGCCTTATTGTTTTCAATGTTGATGGCCTTCGTTTAAATACT\n+CTTTGTTAATGACGGTAATCAAAGATTACATCTCAAACTTAGATTAATATTTTTAAGTAGGGTATACTGA\n+GTTAGTCCTCTCTCTTTACTGATTTTGATATCTGGTAATTGACTTCGAAGAAAGATGCGTCTTTTGGATT\n+TGTAATGACTGGGCCTTAAGTTCATAGGTGTTATTACATGGAGGAACACATTACTTTGGTTGATGATGAT\n+GTTTTGATGATGACTTTCAATGTATGTGCTTATGTTAAGCCTGACATAAGAACTTACTAGTTTGCATAAT\n+GCAAAGGGTTAGTATATGATTTTTAGTATGTGGATTTTGACACTGCCTTTGATTAGGATGTGTGAATGAT\n+TTTGAAACATATTAAGATGTTTATACGAGCGTGTTGTTTACTATTTTCAGGATATGTGGAAGCGGTTGTG\n+TATGATCTATACGCACATTTAGTTCCCAGAGGGCGTTGTCGTCTCCCCCTAAGCAAGGGAGAAACACGTG\n+GCACATGATCTTGCGCTTAACGATAAAAATGGAATCTGATAAAAGTATGGCCTGTTTAAATAGAATTTTG\n+ATGAATAAGATGATGTTTGTGGAAGATAAGATCTCTACCCTTAAGATGGTTGCTGATTATTATCAAAAAG\n+AAGTAAAGTATGATTTTGATGCAGTTGAATCTCCCCGTGAGGCACCTGTATTTAGATGTACTTGTCGATT\n+CCTTGGTTATACCATTATGACTCAAGGCATCGGTAAGAAGAATCCGAAACAGGAAGCTGCACGTCAGATG\n+TTGCTCTTGTTATCAGGAGATGTTGAGACTAACCCTGGACCCGTTCAATCGCGCCCCGTGTATTATCGTT\n+ACAACGACCCTAGATATACACGGTTGGAAAAAGCTATTGAACGTCGAGACGATAAAATTAAAACATTAAT\n+TAAAGAGTTGCGTCGACAAATCAAAAATAGGAAAATTTATTCCCAAGGAATGTTTGATAAATTAACTAAA\n+CAAATTTCTGATGGGATAAAAGATGGTGTTGGCTCTGAACAGATGAATGGAAATTTGACTCGTATTTGTG\n+ATTTCCTAGAGAACACTCTTCCTGGGTTACAAGCAAATATTCAAGCCACTGTGATTGATACAACAGACAA\n+ATATGTTTCTTTAAAAGAGGATATTATGAAGATTGTTTTAGTGATATTGCTTGTTCGTCTTTTAATGGTT\n+TGGAAGAAGTATCGTGCTTCTCTGTGTGTTATTTTAATCTTTATTTTTAAATTTTATGGATTCGATCAAA\n+AGTTGATTGATTTAATTATGGATTTGAAGAATAAAATATTTTCACAGGGTGCATTGGAAGATACAGTTGA\n+GGAGGTTGTATATCATCCTTGGTTCCATACGTGTGGAAAAATCATCTTTGCGGTTATGGCTTTCTTAACA\n+ATTAAGAAAATTCCTGGTAAACAGGATTGGGATAGTTACATAACACGTTTAGATCGTATCCCAAAATCTA\n+TTGAGGGAGCTAAAAAGATCACTGATTACTGTTCAGAATATTTTAATATTGCTAATGATCAGATCAAGAT\n+GATGGTTCTTGGA
AAGACTAAAGAAGAATTGCAACGTGCTAATGGACTATATGGAGAAATTCAAGCTTGG\n+GCTCAAGAGGTTCGCCAGTATTTGGAATTGGATCAACGGAATAAAATTGATCTAGATACTGAAACCGCAA\n+ATCGTGTTGAACAACTTTGGATAAAGGGCTTGAAATTCAAGAGTGAACCCCTTTTGAGTAAGGAAATGTC\n+AGCTTTAGTTCATACAACTCTTTTACCAGCTAAGCAATTGTACGAGTATGTATCGTGTTCTCCTGTTAAA\n+GGGGGAGGACCACGTATGCGTCCAATTTGTTTATGGTTGGTAGGTGAATCAGGAGTTGGTAAGACTGAAA\n+TGGTATATCCATTGTGCATTGATGTTCTTCGGGAAATGGGGATGATTAAGAAAGATGATTTTCATCATCA\n+AGTTTATGGTCGTCAAGTTGAAACTGAATTCTGGGATGGTTATAAAGGACAGAAAATTGTCATTTATGAT\n+GATGCATTTCAGAAGAAAGATGACAAAACAGCAGCTAACCCAGAAATTTTTGAGGTTATTCGCTCTTGCA\n+ACACTTTTCCTCAGCATTTACATATGGCAGCTCTTCATGATAAAAATACTTTTTCTGCTGCTGAATTACT\n+CTTATATACCACTAATGATTATAATGTTAAGCTGGAATCTATTACTTTTCCCGATGCTTTCTTTAATCGT\n+ATGGGCGATATGGCTTATAAAGTTAGTCCTAAGAAAGAGTATGGTATTGAAACCGAGAAAGGGAATTCAG\n+GTAAAACTTATTTAAAATTGGATAAGAGTAAATTGGACAAAACAAAAGCTATTGACCTTTCAGTGTATGA\n+ATTCCAAAAAATTGTACGTGACGAGAAAAGTGATGCAGGTTGGATTGATTCTGGATCACCCTTGGACTAT\n+GAAGATTTTGCTAAATTAGTGTGTTCAAAATGGAAAGAAGCGAAACAATCTTCAATGAATAAATTGAAAT\n+TTTTGGAAGAATATGCTATTCGTGCTCAGGTTGGATCAGAAGAAAATTCTGAATATGGTGATTGTATAGA\n+TTTTGTCGATGATATTGCCAAACGCTTACAAAAAGGTGAAACTCTTGAAGAAATAGAGTTTGATTATGCC\n+TCAGATCCAGAGATGTTTACTCAATACTATCATTTTAAATCTACAATTAAACCGGCATCGCGTTGGCAGA\n+AGTATAAGGATCGGATGGACATTTGTTTGAGCGACTGTAAGACTTATTTAGCAAAGAAATACGAAGAAAT\n+TAAGAAAATTCTTGCCGAACATCCTATCTTGACGATTTTAGGAATGATAGGGGTTGCCTTATCTGCTCTG\n+GCAATGTACTATTGGTTTTCTAAATCGTTGGATCCTGTAGAAGCCGAGGTTGCTCCTTCTGGTGACGCTA\n+AAACAGTGCGCTTACCAAGGAAACTCGTTGAGATTGGTGCTTCTGGAGATGTTAAAACACAGAAGATTGT\n+GAAACCCGTTGTAGAGACCGAATGGCATCGTAACAATAAAGGAGAGATTGAAATTTCTTGTGATGAATGT\n+GGTATGCATAGGATGTCTGCATTTAACAATATGACAGATGAAGAATTTGATAACTGTACATATGAAGATT\n+TGAATAAGGACCAGAAACGTGAACTTGCCCAGTGGTCTACTAAAGATTCTTGGTTAGGTCGATTCTTTTT\n+GAGTCGAGATCGCAAGAATAAGGTTGGAATTTGGGCCGAAGTGGGACAATCAGGTGATGTTAAAACAAAT\n+AAAGCTCAGATTAAACGTGTTGAAGCTGGAGCCGAAGAATTAGTTACTGTTGCTTTAACTCAAGGTTGTT\n+CTGATGATGCTGCACACAATTTGATGATTGACGTTTTCCAAAAAAATACATATAGAATGTCATACTTCCG\n+TGGAGACAAGCGTTATCAACTTGGAAATTGTACATTTGTTCG
TGGTTGGTCTTTTATTATGCCATATCAT\n+TTTGTACAGGCTGTGTTTGCGCGAAG'..b'TATTTTATGTAT\n+CTGTCTTGGTTTGTGGTCACATTTAGTTCACTCTGTTCATATTTATGAAGATAATGTATATATGTGGACT\n+CATTCTCAACCTTCTGGCAATCCTTTCACTGTTATTATTAATTGCTTGTATAATTCGATTATTATGCGAC\n+TGTCATGGATTCGTGTGATGGAGAAATTTCAACCTAGACTTAAGTCCATGAAGTGGTTCAACGAATATGT\n+CGCCTTGATAACATATGGTGACGACAATGTTTTAAACATTGATGCAAAGGTTGTGGAATGGTTTAATCAG\n+ATTAACATTAGTGAGGTTATGACTGAAATGCGACATGAATATACGGACGAAGCTAAAACTGGTGATATTG\n+TTAAATCTCGTAAATTAGAAGATATTTTCTTTTTGAAGAGAAAATTTCGTTTTAGCCCAGAATTACAACG\n+CCATGTTGCTCCATTGAAGATCGAAGTTATTTATGAAATGTTGAATTGGTCTCGCCGCTCTATAGATCCA\n+GATGAAATCTTGATGTCGAACATTGAAACGGCTTTTCGTGAAGTAGTTTACCACGGAAAAGAAGAATACG\n+ATAAACTAAGGTCAGCGGTATTGGCGTTGAAGGTACCCCAGGAACTTCCTGAAAACCCTCAGATTTTGAC\n+GTACAACCAATATTTGCACGATATTGAATATCTTGCGGACCCTTTGTACGACTTTTAGTTAAGATGTGAT\n+CTTGCTTCCTTATACAATTTTGAGAGGTTAATAAGAAGGAAGTAGTGCTATCTTAATAATTAGGTTAACT\n+ATTTAGTTTTACTGTTCAGGATGCCTATTGGCAGCCCCATAATATCCAGGACACCCTCTCTGCTTCTTAT\n+ATGATTAGGTTGTCATTTAGAATAAGAAAATAACCTGCTAACTTTCAAACAAATAATAATAACATTGAAA\n+ATGAAGATCGGAAAATTACTTCCGAGCAAAAAGAGATTGTACACTTTTCTAGTGAAGGAGTTACCCCTAG\n+TACCACTGCGGTGCCTGATATCGTTAGTCTTTCAACAGATTATTTGTCTATGACTACTCGTGAAGATCGT\n+ATCCACACGATTAAAGATTTTCTTTCTCGTCCAATTATAATTCAAACTGGTCTTTGGTCTTCCGCTACAA\n+CTGCCGAAACTCAATTGTATACTGCTAATTTCCCTGAAGTGTTCATTTCTAATACTATGTATCAAGAAAA\n+GTTGCGTGGGTTCGTGGGTTTGCGAGCAACTTTAGTCATTAAAGTGCAAGTGAATTCCCAACCTTTCCAG\n+CAAGGACGATTGATGCTACAGTATTATCCGTATGCACAGTATATGCCTAACCGTGTTTCTTTGGTGAATT\n+CCACTCTCCAAGGACGCTCTGGTTGTCCTCGAACAGATTTGGATTTGAGCGTTGGTACGGAAGTTGAAAT\n+GCGAATTCCTTATGTGTCCCCTCATGTATATTACAATCTTATTACTGGACAAGGATCATTTGGCGCTATA\n+TATTTGGTTGTATATAGCCAACTAAGAGATCAAGTTACAGGAACAGGTTCTGTTGAATATACTGTTTGGG\n+CTCATTTGGAAGATGTAGATGTGCAATACCCGACCGGTGCAAACATTTTCACGGGTAGCTCTCCAAATTT\n+TGCCTCTTTGGGTCAGAAAATGAGTGATGGAAAATTCACTGAAAAAGACTTGAGAGATATTTGGACTTCA\n+AAAGCGTACAATAAACAACCAGACAAAATTTTCGCACAAGTGGCTTCTGAAATAACACAACTCAAAGAAT\n+CAGGAACAATTAGTTCTGGAATTGGACAAGTTTCTGAAGGTCTTTCTACCATGTCTAAAATCCCTATACT\n+CGGAAATATGTTTACAAAACCCGCC
TGGATTTCAGCTCAAGTATCTAATATCTTCAAGATGCTTGGTTTT\n+TCAAAACCCACTGTTCAAGGTCTTCCTTGTGAATCGAAACTGCGTGGTCAAGTTCGAATGGCGAATTTTG\n+ATGGCGCTGATACATCACATAAATTGGCTTTGTCTGCCCAAAACGAAATTGAAACAAAATCTGGACTTTC\n+TGGAACTTCTCCTGATGAAATGGATTTATCACACGTCCTTTCCATACCAAATTTTTGGGATCGTTTTACT\n+TGGAACACAACCGATGCCACTAGTTCTATTTTATGGGATAATTATGTTACACCAATGAAAATTAAACCAT\n+ATTCCTCTACAATATTAGATAGATTTAGATGCACTCATATGGGTTTTGTAGCCAACACACACGGTTATTG\n+GTGTGGATCAATAGTTTATACTTTTAAATTTGTTAAGACTCAATTTCATTCTGGACGTTTACGCATTAGT\n+TTTATTCCATTTTATTATAATACGACTATATCTGCAGGAGTTCCCGATGTTTCTCGTACCCAAAAAGTAA\n+TCGTTGATCTGCGCACCTCTACAGAAGTCTCTTTCACTATTCCGTATGTGTCTTCACGACCTTGGATGTA\n+CTGTATTCGTCCTGAAGCTTCGTGGCTTGGAACCGATAATGCTTTGATGTACAACGCCGTTACGGGTATA\n+GTGAGAGTTGAGGTTCTTAACCAGTTGGTTGCCGCTAACAACGTGTTTCAATCTATAGACACTATTGTTG\n+AAGTTAGTGGTGGTCCTGATTTAACTTTTGCAGCACCAATGGCTCCCTCTTATGTTCCTTATTCTGGAGG\n+TTTTACTTTAGCAGATGATGCGGCAGCAAAGAAACAGCGTGAGGAGGAGTATGACAACAACATACCTCAA\n+ACTATTTCTAATCGTGGAAAACGTGAGGTTGAAGATGCTCGTATTGTTGCGCAAGTAATGGGTGAAGATT\n+TAGCTATTCAAAGAAACGATGCTCAACATGGTGTTCATCCAATGACTATAGACACTCATAAGATCGACTC\n+AAATTGGTCTCCGGAAGCGCATTGTATTGGTGAAAAGATTATGTCTATTCGCCAATTGATTAAGCGTTTT\n+GGCATGGCTTTGAACTCCTTGAATTTGATAAGTGATGCACCAAACACCTTGATAGCACCATTTTCAGTTC\n+AGCACCCAACTCCTGTTGTTGCCCCTGCTGAACCCATGTCCCTTTTTGAATATTATTATTTCATTTATGG\n+ATTTTGGAGAGGTGGCATGAGATTTAAACTTCAGGCAGTACGTACAAACTCAGCAGAAACATCAGTTAAA\n+ACCGACACAACTTGGACTGTAAATTTGTGGAATTCTGTACAAGATTCTTTTAATTCTCTAATTAATGTAT\n+TTAGTACTACTGATTACCCTATAAAATCCACAGGAGCACTTCCAGCCGGAACAAGCGGTTTTGGCAATTC\n+GATGACGTATATAGATCCTGAGGTTGAAGGTTTTATGGAATTTGAGATTCCATATTATAATATCTCCCAT\n+ATTTCTCCAGCTACAACCTATGTTCGTGGTACTGAATCTCCTATTACAATTAATAGTGTCTTGCGTGGAC\n+ATTTGCCACCACAAATTGTGGCTGTTGCACCACAGGGCACTATTGCCACTACAGATGTAGTGAACGCTCA\n+ATTTGCTCGTGCTCCTTCTGACGACTTTTCATTTATGTATCTCGTTGGTGTTCCACCACTTACCAACGTC\n+GCTCGTCCCTAACTCCCTTACTATTCTGGATCCTTTAAAATTTATTAGGATAGACAAAAATTAACTCTAT\n+ATTAGATAGTATTAGATTAAGTTTCTTTTTGGTTTTGGGTTTTATTCAGTAACTATCTGCCCTGCTTACA\n+CGGGTATTATTTTTAATTCTTGTCCCTTCTGGACTCTTTTATTTTGTATTTTCA
AAATTTTTACTAATTT\n+TTAGTCAGAGTCCTTAGGGGCTACCAGGTTTTTCGCAATTTTCCTGCTTACTGACAGTAATTGCAATTTC\n+GAATTAAAATAATAGTTGTTTTCT\n'