changeset 10:5b4ac70948d2 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/snpeff commit eea43430ff90fe6b13b295f6d5efb2208401a7ef
author iuc
date Tue, 27 Mar 2018 09:44:18 -0400
parents 68693743661e
children bfa6c1b8a03c
files gbk2fa.py snpEff.xml snpEff_create_db.xml snpEff_databases.xml snpEff_download.xml snpEff_macros.xml test-data/input.gbk test-data/input.gbk.gz test-data/output_nover.fna test-data/output_ver.fna test-data/pBR322.fna test-data/pBR322.gbk
diffstat 12 files changed, 968 insertions(+), 48 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gbk2fa.py	Tue Mar 27 09:44:18 2018 -0400
@@ -0,0 +1,43 @@
+import argparse
+import bz2
+import contextlib
+import gzip
+import sys
+
+import magic
+from Bio import SeqIO
+
+parser = argparse.ArgumentParser()  # CLI: <genbank_file> <fasta_file> [--remove_version]
+parser.add_argument("genbank_file", help="GenBank input file. Can be compressed with gzip or bzip2")
+parser.add_argument("fasta_file", help="FASTA output dataset")
+parser.add_argument("--remove_version", dest="remove_version", action="store_true", help="Remove version number from NCBI formatted accession numbers. For example, this would convert 'B000657.2' to 'B000657'")
+args = parser.parse_args()
+
+gbk_filename = args.genbank_file
+fa_filename = args.fasta_file
+
+
+@contextlib.contextmanager
+def get_file_handle(gbk_filename):
+    """Yield a text-mode handle for a plain, gzip'ed or bzip2'ed GenBank file; the handle is always closed on exit."""
+    openers = {'text/plain': open, 'application/gzip': gzip.open, 'application/x-bzip2': bz2.open}
+    # Sniff the file that was actually passed in, not the module-level CLI argument.
+    f_type = magic.from_file(gbk_filename, mime=True)
+    if f_type not in openers:
+        sys.exit("Cannot process file of type {}. Only plain, gzip'ed, and bzip2'ed genbank files are accepted ".format(f_type))
+    input_handle = openers[f_type](gbk_filename, "rt")
+    try:
+        yield input_handle
+    finally:  # close the handle even if the caller's with-body raises
+        input_handle.close()
+
+
+with get_file_handle(gbk_filename) as input_handle, open(fa_filename, "w") as output_handle:
+    # Emit one FASTA record per GenBank record read from the input handle.
+    for seq_record in SeqIO.parse(input_handle, "genbank"):
+        seq_id = seq_record.id
+        if args.remove_version:
+            # Keep only the part of the id before the first dot.
+            seq_id = seq_id.split('.')[0]
+        print('Writing FASTA record: {}'.format(seq_id))
+        output_handle.write(">{}\n{}\n".format(seq_id, seq_record.seq))
--- a/snpEff.xml	Tue Nov 14 05:42:51 2017 -0500
+++ b/snpEff.xml	Tue Mar 27 09:44:18 2018 -0400
@@ -1,13 +1,15 @@
-<tool id="snpEff" name="SnpEff" version="@WRAPPER_VERSION@.1">
-    <description>Variant effect and annotation</description>
+<tool id="snpEff" name="SnpEff eff:" version="@wrapper_version@.1">
+    <description> annotate variants</description>
     <macros>
         <import>snpEff_macros.xml</import>
     </macros>
-    <expand macro="requirements" />
+    <requirements>
+        <expand macro="requirement" />
+    </requirements>
     <expand macro="stdio" />
     <expand macro="version_command" />
     <command><![CDATA[
-        snpEff -Xmx8g eff
+        snpEff @java_options@ eff
         -i $inputFormat -o ${outputConditional.outputFormat} -upDownStreamLen $udLength
         #if $spliceSiteSize and str($spliceSiteSize) != '':
           -spliceSiteSize "$spliceSiteSize"
@@ -72,6 +74,11 @@
             #end for
           #end if
           '${snpDb.snpeff_db.metadata.genome_version}'
+        #elif $snpDb.genomeSrc == 'custom':
+            -dataDir '${snpDb.snpeff_db.extra_files_path}'
+            -configOption '${snpDb.snpeff_db.metadata.genome_version}'.genome='${snpDb.snpeff_db.metadata.genome_version}'
+            -configOption '${snpDb.snpeff_db.metadata.genome_version}'.codonTable='${snpDb.codon_table}'
+            '${snpDb.snpeff_db.metadata.genome_version}'
         #else
           -download
           '$snpDb.genome_version'
@@ -92,7 +99,7 @@
         #end if
     ]]></command>
     <inputs>
-        <param name="input" type="data" format="vcf,tabular,pileup,bed" label="Sequence changes (SNPs, MNPs, InDels)"/>
+        <param name="input" type="data" format="vcf,bed" label="Sequence changes (SNPs, MNPs, InDels)"/>
 
         <param name="inputFormat" type="select" label="Input format">
             <option value="vcf" selected="true">VCF</option>
@@ -116,15 +123,17 @@
 
         <conditional name="snpDb">
             <param name="genomeSrc" type="select" label="Genome source">
-                <option value="cached">Locally installed reference genome</option>
-                <option value="history">Reference genome from your history</option>
-                <option value="named">Named on demand</option>
+                <!-- These options are referenced in the help section of SnpEff download tool. If you change them, change help of SnpEff download as well -->
+                <option value="cached">Locally installed snpEff database</option>
+                <option value="history">Downloaded snpEff database in your history</option>
+                <option value="named">Download on demand</option>
+                <option value="custom">Custom snpEff database in your history</option>
             </param>
             <when value="cached">
                 <param name="genomeVersion" type="select" label="Genome">
                     <!--GENOME    DESCRIPTION-->
                     <options from_data_table="snpeffv_genomedb">
-                            <filter type="static_value" name="snpeff_version" value="@SNPEFF_VERSION@" column="1"/>
+                            <filter type="static_value" name="snpeff_version" value="@snpeff_version@" column="1"/>
                             <filter type="unique_value" column="2" />
                     </options>
                 </param>
@@ -138,11 +147,11 @@
                 </section>
             </when>
             <when value="history">
-                <param name="snpeff_db" type="data" format="snpeffdb" label="@SNPEFF_VERSION@ Genome Data">
+                <param name="snpeff_db" type="data" format="snpeffdb" label="@snpeff_version@ Genome Data">
                     <options options_filter_attribute="metadata.snpeff_version" >
-                        <filter type="add_value" value="@SNPEFF_VERSION@" />
+                        <filter type="add_value" value="@snpeff_version@" />
                     </options>
-                    <validator type="expression" message="This version of SnpEff will only work with @SNPEFF_VERSION@ genome databases">value is not None and value.metadata.snpeff_version == "@SNPEFF_VERSION@"</validator>
+                    <validator type="expression" message="This version of SnpEff will only work with @snpeff_version@ genome databases">value is not None and value.metadata.snpeff_version == "@snpeff_version@"</validator>
                 </param>
                 <section name="reg_section" expanded="false" title="Regulation options">
                     <!-- From metadata -->
@@ -155,13 +164,48 @@
             </when>
             <when value="named">
                 <param name="genome_version" type="text" value="" label="Snpff Genome Version Name (e.g. GRCh38.76)">
-                    <help>@SNPEFF_DATABASE_URL@</help>
+                    <help>@snpeff_database_url@</help>
                     <validator type="regex" message="A genome version name is required">\S+</validator>
                 </param>
             </when>
+            <when value="custom">
+                <param name="snpeff_db" type="data" format="snpeffdb" label="@snpeff_version@ Genome Data">
+                    <options options_filter_attribute="metadata.snpeff_version" >
+                        <filter type="add_value" value="@snpeff_version@" />
+                    </options>
+                    <validator type="expression" message="This version of SnpEff will only work with @snpeff_version@ genome databases">value is not None and value.metadata.snpeff_version == "@snpeff_version@"</validator>
+                </param>
+                <param name="codon_table" type="select" label="Select genetic code for this sequence" help="If this sequence uses non-standard genetic code, select one from these options">
+                    <option selected="true" value="Standard">Standard</option>
+                    <option value="Vertebrate_Mitochondrial">Vertebrate_Mitochondrial</option>
+                    <option value="Yeast_Mitochondrial">Yeast_Mitochondrial</option>
+                    <option value="Mold_Mitochondrial">Mold_Mitochondrial</option>
+                    <option value="Protozoan_Mitochondrial">Protozoan_Mitochondrial</option>
+                    <option value="Coelenterate">Coelenterate</option>
+                    <option value="Mitochondrial">Mitochondrial</option>
+                    <option value="Mycoplasma">Mycoplasma</option>
+                    <option value="Spiroplasma">Spiroplasma</option>
+                    <option value="Invertebrate_Mitochondrial">Invertebrate_Mitochondrial</option>
+                    <option value="Ciliate_Nuclear">Ciliate_Nuclear</option>
+                    <option value="Dasycladacean_Nuclear">Dasycladacean_Nuclear</option>
+                    <option value="Hexamita_Nuclear">Hexamita_Nuclear</option>
+                    <option value="Echinoderm_Mitochondrial">Echinoderm_Mitochondrial</option>
+                    <option value="Flatworm_Mitochondrial">Flatworm_Mitochondrial</option>
+                    <option value="Euplotid_Nuclear">Euplotid_Nuclear</option>
+                    <option value="Bacterial_and_Plant_Plastid">Bacterial_and_Plant_Plastid</option>
+                    <option value="Alternative_Yeast_Nuclear">Alternative_Yeast_Nuclear</option>
+                    <option value="Ascidian_Mitochondrial">Ascidian_Mitochondrial</option>
+                    <option value="Alternative_Flatworm_Mitochondrial">Alternative_Flatworm_Mitochondrial</option>
+                    <option value="Blepharisma_Macronuclear">Blepharisma_Macronuclear</option>
+                    <option value="Chlorophycean_Mitochondrial">Chlorophycean_Mitochondrial</option>
+                    <option value="Trematode_Mitochondrial">Trematode_Mitochondrial</option>
+                    <option value="Scenedesmus_obliquus_Mitochondrial">Scenedesmus_obliquus_Mitochondrial</option>
+                    <option value="Thraustochytrium_Mitochondrial">Thraustochytrium_Mitochondrial</option>
+            </param>
+            </when>
         </conditional>
 
-        <param name="udLength" type="select" label="Upstream / Downstream length">
+        <param name="udLength" type="select" label="Upstream / Downstream length" argument="-ud" >
             <option value="0">No upstream / downstream intervals (0 bases)</option>
             <option value="200">200 bases</option>
             <option value="500">500 bases</option>
@@ -172,7 +216,7 @@
             <option value="20000">20000 bases</option>
         </param>
 
-        <param name="spliceSiteSize" type="select" optional="true" label="Set size for splice sites (donor and acceptor) in bases">
+        <param name="spliceSiteSize" type="select" optional="true" label="Set size for splice sites (donor and acceptor) in bases" argument="-ss">
             <option value="1">1 base</option>
             <option value="2" selected="true">2 bases</option>
             <option value="3">3 bases</option>
@@ -191,9 +235,9 @@
             </param>
             <when value="no"/>
             <when value="yes">
-                <param name="spliceRegionExonSize" type="integer" value="" min="1" max="10" optional="true" label="Set size for splice site region within exons. Default: 3 bases"/>
-                <param name="spliceRegionIntronMin" type="integer" value="" min="1" max="10" optional="true" label="Set minimum number of bases for splice site region within intron. Default: 3 bases"/>
-                <param name="spliceRegionIntronMax" type="integer" value="" min="1" max="10" optional="true" label="Set maximum number of bases for splice site region within intron. Default: 8 bases"/>
+                <param name="spliceRegionExonSize" type="integer" value="" min="1" max="10" optional="true" label="Set size for splice site region within exons. Default: 3 bases" argument="-spliceRegionExonSize"/>
+                <param name="spliceRegionIntronMin" type="integer" value="" min="1" max="10" optional="true" label="Set minimum number of bases for splice site region within intron. Default: 3 bases" argument="-spliceRegionIntronMin"/>
+                <param name="spliceRegionIntronMax" type="integer" value="" min="1" max="10" optional="true" label="Set maximum number of bases for splice site region within intron. Default: 8 bases" argument="-spliceRegionIntronMax"/>
             </when>
         </conditional>
 
@@ -223,7 +267,7 @@
             -->
         </param>
         <!-- -cancerSamples <file>           : Two column TXT file defining 'oringinal \t derived' samples. -->
-        <param name="intervals" type="data" format="bed" optional="true" label="Use custom interval file for annotation"/>
+        <param name="intervals" type="data" format="bed" optional="true" label="Use custom interval file for annotation" argument="-interval"/>
         <param name="transcripts" type="data" format="tabular" optional="true" label="Only use the transcripts in this file" help="Format is one transcript ID per line"/>
         <param name="filterOut" type="select" display="checkboxes" multiple="true" label="Filter output">
             <option value="-no-downstream">Do not show DOWNSTREAM changes</option>
@@ -286,20 +330,20 @@
             </when>
         </conditional>
 
-        <param name="offset" type="select" display="radio" label="Chromosomal position">
+        <param name="offset" type="select" display="radio" label="Chromosomal position" argument="-0 and -1">
             <option value="default" selected="true">Use default (based on input type)</option>
             <option value="-0">Force zero-based positions (both input and output)</option>
             <option value="-1">Force one-based positions (both input and output)</option>
         </param>
-        <param name="chr" type="text" label="Text to prepend to chromosome name">
+        <param name="chr" type="text" label="Text to prepend to chromosome name" argument="-chr">
             <help>
                By default SnpEff simplifies all chromosome names. For instance 'chr1' is just '1'.
                You can prepend any string you want to the chromosome name
             </help>
             <validator type="regex" message="No whitespace allowed">^\S*$</validator>
         </param>
-        <param name="generate_stats" type="boolean" truevalue="" falsevalue="-noStats" checked="true" label="Produce Summary Stats"/>
-        <param name="noLog" type="boolean" truevalue="-noLog" falsevalue="" checked="true" label="Suppress reporting usage statistics to server"/>
+        <param name="generate_stats" type="boolean" truevalue="" falsevalue="-noStats" checked="true" label="Produce Summary Stats" argument="-noStats"/>
+        <param name="noLog" type="boolean" truevalue="-noLog" falsevalue="" checked="true" label="Suppress reporting usage statistics to server" argument="-noLog"/>
     </inputs>
     <outputs>
         <data name="snpeff_output" format="vcf">
@@ -330,9 +374,34 @@
         </test>
     </tests>
     <help><![CDATA[
-This tool calculate the effect of variants (SNPs/MNPs/Insertions) and deletions.
+
+**What it does**
+
+SnpEff is a variant annotation and effect prediction tool. It annotates and predicts the effects of genetic variants (such as amino acid changes).
+
+A typical SnpEff use case would be:
+
+ - **Input**: The inputs are predicted variants (SNPs, insertions, deletions and MNPs). The input file is usually obtained as a result of a sequencing experiment, and it is usually in variant call format (VCF).
+ - **Output**: SnpEff analyzes the input variants. It annotates the variants and calculates the effects they produce on known genes (e.g. amino acid changes). A list of effects and annotations that SnpEff can calculate can be found here.
+
+By genetic variant we mean difference between a genome and a "reference" genome. As an example, imagine we are sequencing a "sample". Here "sample" can mean anything that you are interested in studying, from a cell culture, to a mouse or a cancer patient.  It is a standard procedure to compare your sample sequences against the corresponding "reference genome". For instance you may compare the cancer patient genome against the "reference genome".
 
-@EXTERNAL_DOCUMENTATION@
+In a typical sequencing experiment, you will find many places in the genome where your sample differs from the reference genome. These are called "genomic variants" or just "variants". 
+Typically, variants are categorized as follows:
+
+ - SNP (Single-Nucleotide Polymorphism) Reference = 'A', Sample = 'C'
+ - Ins (Insertion) Reference = 'A', Sample = 'AGT'
+ - Del (Deletion) Reference = 'AC', Sample = 'C'
+ - MNP (Multiple-nucleotide polymorphism) Reference = 'ATA', Sample = 'GTC'
+ - MIXED (Multiple-nucleotide and an InDel) Reference = 'ATA', Sample = 'GTCAGT'
+
+This is not a comprehensive list, it is just to give you an idea.
+
+Suppose you have a huge file describing all the differences between your sample and the reference genome. But you want to know more about these variants than just their genetic coordinates. E.g.: Are they in a gene? In an exon? Do they change protein coding? Do they cause premature stop codons? SnpEff can help you answer all these questions. The process of adding this information about the variants is called "Annotation". 
+SnpEff provides several degrees of annotations, from simple (e.g. which gene is each variant affecting) to extremely complex annotations (e.g. will this non-coding variant affect the expression of a gene?). It should be noted that the more complex the annotations, the more they rely on computational predictions. Such computational predictions can be incorrect, so results from SnpEff (or any prediction algorithm) cannot be trusted blindly; they must be analyzed and independently validated by corresponding wet-lab experiments.
+
+@snpeff_in_galaxy_info@
+@external_documentation@
 ]]>
     </help>
     <expand macro="citations" />
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/snpEff_create_db.xml	Tue Mar 27 09:44:18 2018 -0400
@@ -0,0 +1,128 @@
+<tool id="snpEff_build_gb" name="SnpEff build:" version="@wrapper_version@.1">
+    <description> database from Genbank record</description>
+    <macros>
+        <import>snpEff_macros.xml</import>
+    </macros>
+    <requirements>
+        <expand macro="requirement" />
+        <requirement type="package" version="3.6">python</requirement>
+        <requirement type="package" version="1.70">biopython</requirement>
+        <requirement type="package" version="0.4.15">python-magic</requirement>
+        <requirement type="package" version="5.32">libmagic</requirement>
+    </requirements>
+    <expand macro="stdio" />
+    <expand macro="version_command" />
+    <command><![CDATA[
+
+        #if str( $fasta.fasta_selector ) == "yes":
+            python3 '$__tool_directory__/gbk2fa.py' '${input_gbk}' '${output_fasta}'
+            #if $fasta.remove_version:
+                '${fasta.remove_version}'
+            #end if
+            &&
+        #end if
+
+        mkdir -p '${snpeff_output.files_path}'/'${genome_version}' &&
+        
+        ln -s '${input_gbk}' '${snpeff_output.files_path}'/'${genome_version}'/genes.gbk &&
+
+        snpEff @java_options@ build -v 
+        -configOption '${genome_version}'.genome='${genome_version}' 
+        -configOption '${genome_version}'.codonTable='${codon_table}'
+        -genbank -dataDir '$snpeff_output.files_path' '$genome_version'
+
+    ]]></command>
+    <inputs>
+        <param name="genome_version" type="text" value="" label="Name for the database" help="for E. coli K12 you may want to use 'EcK12' etc.">
+            <validator type="regex" message="A genome version name is required">\S+</validator>
+        </param>
+        <param name="input_gbk" type="data" format="genbank,genbank.gz" label="Genbank dataset to build database from" help="This Genbank file will be used to generate snpEff database"/>
+        <param name="codon_table" type="select" label="Select genetic code for this sequence" help="If this sequence uses non-standard genetic code, select one from these options">
+            <option selected="true" value="Standard">Standard</option>
+            <option value="Vertebrate_Mitochondrial">Vertebrate_Mitochondrial</option>
+            <option value="Yeast_Mitochondrial">Yeast_Mitochondrial</option>
+            <option value="Mold_Mitochondrial">Mold_Mitochondrial</option>
+            <option value="Protozoan_Mitochondrial">Protozoan_Mitochondrial</option>
+            <option value="Coelenterate">Coelenterate</option>
+            <option value="Mitochondrial">Mitochondrial</option>
+            <option value="Mycoplasma">Mycoplasma</option>
+            <option value="Spiroplasma">Spiroplasma</option>
+            <option value="Invertebrate_Mitochondrial">Invertebrate_Mitochondrial</option>
+            <option value="Ciliate_Nuclear">Ciliate_Nuclear</option>
+            <option value="Dasycladacean_Nuclear">Dasycladacean_Nuclear</option>
+            <option value="Hexamita_Nuclear">Hexamita_Nuclear</option>
+            <option value="Echinoderm_Mitochondrial">Echinoderm_Mitochondrial</option>
+            <option value="Flatworm_Mitochondrial">Flatworm_Mitochondrial</option>
+            <option value="Euplotid_Nuclear">Euplotid_Nuclear</option>
+            <option value="Bacterial_and_Plant_Plastid">Bacterial_and_Plant_Plastid</option>
+            <option value="Alternative_Yeast_Nuclear">Alternative_Yeast_Nuclear</option>
+            <option value="Ascidian_Mitochondrial">Ascidian_Mitochondrial</option>
+            <option value="Alternative_Flatworm_Mitochondrial">Alternative_Flatworm_Mitochondrial</option>
+            <option value="Blepharisma_Macronuclear">Blepharisma_Macronuclear</option>
+            <option value="Chlorophycean_Mitochondrial">Chlorophycean_Mitochondrial</option>
+            <option value="Trematode_Mitochondrial">Trematode_Mitochondrial</option>
+            <option value="Scenedesmus_obliquus_Mitochondrial">Scenedesmus_obliquus_Mitochondrial</option>
+            <option value="Thraustochytrium_Mitochondrial">Thraustochytrium_Mitochondrial</option>
+        </param>
+        <conditional name="fasta">
+            <param name="fasta_selector" type="select" display="radio" label="Parse Genbank into Fasta" help="This will generate an additional dataset containing all sequences from Genbank file in FASTA format">
+                <option value="yes" selected="true">Yes</option>
+                <option value="no">No</option>
+            </param>
+            <when value="yes">
+                <param type="boolean" name="remove_version" truevalue="--remove_version" falsevalue="" checked="true" label="Remove sequence version label?" help="Genbank sequences have version numbers such as B000564.2. This option removes them leaving only B000564" argument="--remove_version"/>
+            </when>
+            <when value="no"/>
+        </conditional>
+    </inputs>
+    <outputs>
+        <data name="snpeff_output" format="snpeffdb" label="@snpeff_version@ database for ${genome_version}"/>
+        <data name="output_fasta" format="fasta" label="Fasta sequences for ${genome_version}">
+            <filter>fasta['fasta_selector'] == 'yes'</filter>
+        </data>
+    </outputs>
+    <tests>
+        <test>
+            <param name="genome_version" value="pBR322"/>
+            <param name="input_gbk" value="pBR322.gbk" />
+            <output name="snpeff_output">
+                <assert_contents>
+                    <has_text text="pBR322" />
+                </assert_contents>
+            </output>
+            <output name="output_fasta" value="pBR322.fna"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+**What it does**
+
+This tool uses `"snpEff build -genbank"` command to create a snpEff database from a Genbank dataset. If **Parse Genbank into Fasta** is selected (the default behavior) the tool will also convert Genbank dataset into a single FASTA dataset.
+
+
+Using Genbank data for creating databases has several advantages:
+
+ #. Genbank files contain annotations (such as locations of genes) together with sequences. This way one ensures that these two are in sync with each other
+ #. When you are analyzing small genomes it is much more convenient to create a database on the fly and use it.
+
+-------
+
+.. class:: infomark
+
+**The usage scenario**
+
+Suppose you have a series of Illumina reads from an experiment involving *E. coli* K-12 MG1655. You want to map these reads to the reference genome of K-12 MG1655, call variants, and annotate them using snpEff. This tool enables you to follow the following analysis steps:
+
+ #. Download genome from `NCBI <https://www.ncbi.nlm.nih.gov>`_ into Galaxy.
+ #. Use this tool to generate a snpEff database and FASTA sequences from the file you downloaded at step 1.
+ #. Use your Illumina reads to map against FASTA dataset generated in the previous step using BWA-MEM.
+ #. Call variants using **Freebayes**.
+ #. Annotate vcf output of Freebayes with **SnpEff eff** using database generated at step 2 (using *Custom* option for **Genome source** parameter).
+
+In this scenario Genbank dataset is used twice. First, it is used to produce FASTA sequences that are used by BWA to map against. Second, it is used to create snpEff database. This guarantees that you will not have any issues related to reference sequence naming.
+
+@snpeff_in_galaxy_info@
+@external_documentation@
+]]>
+    </help>
+    <expand macro="citations" />
+</tool>
--- a/snpEff_databases.xml	Tue Nov 14 05:42:51 2017 -0500
+++ b/snpEff_databases.xml	Tue Mar 27 09:44:18 2018 -0400
@@ -1,18 +1,47 @@
-<tool id="snpEff_databases" name="SnpEff available databases" version="@WRAPPER_VERSION@.1">
-    <description></description>
+<tool id="snpEff_databases" name="SnpEff databases:" version="@wrapper_version@.1">
+    <description> list available databases</description>
     <macros>
         <import>snpEff_macros.xml</import>
     </macros>
-    <expand macro="requirements" />
+    <requirements>
+        <expand macro="requirement" />
+    </requirements>
     <expand macro="stdio" />
     <expand macro="version_command" />
     <command><![CDATA[
-snpEff databases | grep -v '^---' | sed 's/^Genome/#Genome/' | sed 's/  *//g' > '$snpeff_dbs'
+
+        snpEff databases | grep -v '^---' | sed 's/^Genome/#Genome/' | sed 's/  *//g' 
+
+        #if $include_pattern:
+             | grep '${include_pattern}' 
+        #end if
+
+        #if $exclude_pattern:
+             | grep -v '${exclude_pattern}' 
+        #end if
+
+        > '${snpeff_dbs}'
+            
     ]]></command>
     <inputs>
+        <param name="include_pattern" size="40" type="text" value="" optional="True" label="List entries matching the following expression" help="Databases matching this expression will be listed. Here you can enter text or regular expression. For example, to show only mouse databases use 'Mouse'. Note that this parameter is case-sensitive.">
+            <sanitizer>
+                <valid initial="string.digits,string.letters">
+                    <add value="^" />
+                </valid>
+            </sanitizer>
+        </param>
+        <param name="exclude_pattern" size="40" type="text" value="" optional="True" label="Do not output entries matching the following expression" help="Databases matching this expression WILL NOT BE listed. Here you can enter text or regular expression. For example, to avoid all ENSEMBL bundles enter 'ENSEMBL'. Note that this parameter is case-sensitive.">
+            <sanitizer>
+                <valid initial="string.digits,string.letters">
+                    <add value="^" />
+                </valid>
+            </sanitizer>
+        </param>
+
     </inputs>
     <outputs>
-        <data format="tabular" name="snpeff_dbs" label="${tool.name} @SNPEFF_VERSION@ available databases" />
+        <data format="tabular" name="snpeff_dbs" label="${tool.name} @snpeff_version@ available databases" />
     </outputs>
     <tests>
         <test>
@@ -22,9 +51,39 @@
                 </assert_contents>
             </output>
         </test>
+         <test>
+            <param name="include_pattern" value="ebola"/>
+            <output name="snpeff_dbs">
+                <assert_contents>
+                    <has_text text="ebola_zaire" />
+                </assert_contents>
+            </output>
+        </test>
     </tests>
     <help><![CDATA[
-@EXTERNAL_DOCUMENTATION@
+
+**What it does**
+
+This tool downloads the master list of snpEff databases from @snpeff_database_url@. You can then look at this list and decide which database to use for your analysis. For example, if **List entries matching the following expression** parameter of this tool is set to *Mouse* then it will produce a tabular dataset with the following content::
+
+    mm10  Mouse  http://downloads.sourceforge.net/project/snpeff/databases/v4_3/snpEff_v4_3_mm10.zip
+    mm9   Mouse  http://downloads.sourceforge.net/project/snpeff/databases/v4_3/snpEff_v4_3_mm9.zip
+
+This means that there are two available snpEff databases for mouse genome versions mm9 and mm10. In order to download these databases you should use the identifier from the first column (e.g., mm9 or mm10 in this case). 
+
+-------
+
+.. class:: infomark
+
+**The usage scenario**
+
+There are two ways to use names of databases obtained with this tool in Galaxy's version of snpEff:
+
+    #. Use **SnpEff download** tool. It will download the database to the history and you will be able to use it in **SnpEff eff** tool using *Downloaded snpEff database in your history* option of the **Genome source** parameter.
+    #. Use *Download on demand* option of the **SnpEff eff** tool (again, **Genome source** parameter). In this case snpEff will download the database before performing annotation. 
+
+@snpeff_in_galaxy_info@
+@external_documentation@
     ]]></help>
     <expand macro="citations" />
 </tool>
--- a/snpEff_download.xml	Tue Nov 14 05:42:51 2017 -0500
+++ b/snpEff_download.xml	Tue Mar 27 09:44:18 2018 -0400
@@ -1,22 +1,24 @@
-<tool id="snpEff_download" name="SnpEff Download" version="@WRAPPER_VERSION@.1">
-    <description>Download a new database</description>
+<tool id="snpEff_download" name="SnpEff download:" version="@wrapper_version@.1">
+    <description> download a pre-built database</description>
     <macros>
         <import>snpEff_macros.xml</import>
     </macros>
-    <expand macro="requirements" />
+    <requirements>
+        <expand macro="requirement" />
+    </requirements>
     <expand macro="stdio" />
     <expand macro="version_command" />
     <command><![CDATA[
 snpEff download -dataDir '$snpeff_db.files_path' -v '$genome_version'
     ]]></command>
     <inputs>
-        <param name="genome_version" type="text" value="" label="Select the genome version you want to download (e.g. GRCh38.86, GRCh37.75, hg38, or GRCm38.86)">
-            <help>@SNPEFF_DATABASE_URL@</help>
+        <param name="genome_version" type="text" value="" label="Select the annotation database you want to download (e.g. GRCh38.86, mm10 etc.)" help="The list of available databases can be obtained with 'SnpEff databases' tool">
+            <help>@snpeff_database_url@</help>
             <validator type="regex" message="A genome version name is required">\S+</validator>
         </param>
     </inputs>
     <outputs>
-        <data format="snpeffdb" name="snpeff_db" label="${tool.name} @SNPEFF_VERSION@ ${genome_version}" />
+        <data format="snpeffdb" name="snpeff_db" label="${tool.name} @snpeff_version@ ${genome_version}"/>
     </outputs>
     <tests>
         <test>
@@ -29,7 +31,24 @@
         </test>
     </tests>
     <help><![CDATA[
-@EXTERNAL_DOCUMENTATION@
+
+**What it does**
+
+This tool downloads a specified database from @snpeff_database_url@. It deposits it into the history. 
+
+-------
+
+.. class:: infomark
+
+**The usage scenario**
+
+Suppose you want to annotate a VCF file containing variants within the mm10 version of the mouse genome. To do this you can:
+
+    #. Download mm10 snpEff database by typing *mm10* into **Select the annotation database...** text box.
+    #. Use **SnpEff eff** by choosing the downloaded database from the history using *Downloaded snpEff database in your history* option of the **Genome source** parameter.
+    
+@snpeff_in_galaxy_info@
+@external_documentation@
     ]]></help>
     <expand macro="citations" />
 </tool>
--- a/snpEff_macros.xml	Tue Nov 14 05:42:51 2017 -0500
+++ b/snpEff_macros.xml	Tue Mar 27 09:44:18 2018 -0400
@@ -1,8 +1,6 @@
 <macros>
-    <xml name="requirements">
-        <requirements>
-            <requirement type="package" version="4.3.1r">snpeff</requirement>
-        </requirements>
+    <xml name="requirement">
+        <requirement type="package" version="4.3.1t">snpeff</requirement>
     </xml>
   <xml name="stdio">
     <stdio>
@@ -15,16 +13,61 @@
 snpEff -version
     ]]></version_command>
   </xml>
-  <token name="@WRAPPER_VERSION@">4.3r</token>
-  <token name="@SNPEFF_VERSION@">SnpEff4.3</token>
-  <token name="@SNPEFF_DATABASE_URL@">https://sourceforge.net/projects/snpeff/files/databases/v4_3/</token>
-  <token name="@EXTERNAL_DOCUMENTATION@">
-For details about this tool, please go to: http://snpeff.sourceforge.net/SnpEff_manual.html
+  <token name="@wrapper_version@">4.3.1t</token>
+  <token name="@snpeff_version@">SnpEff4.3</token>
+  <token name="@snpeff_database_url@">https://sourceforge.net/projects/snpeff/files/databases/v4_3/</token>
+  <token name="@java_options@">-Xmx\${GALAXY_MEMORY_MB:-8192}m</token>
+  <token name="@external_documentation@">
+
+
+-------
+
+To learn more about snpEff read its manual at http://snpeff.sourceforge.net/SnpEff_manual.html
   </token>
+  <token name="@snpeff_in_galaxy_info@">
+
+-------
+
+.. class:: warningmark
+
+**Using SnpEff in Galaxy: A few points to remember**
+
+SnpEff relies on specially formatted databases to generate annotations. It will not work without them. There are several ways in which these databases can be obtained.
+
+**Pre-cached databases**
+
+Many standard (e.g., human, mouse, *Drosophila*) databases are likely pre-cached within a given Galaxy instance. You should be able to see them listed in the **Genome** drop-down of the **SnpEff eff** tool.
+
+If you *do not see them*, keep reading...
+
+**Download pre-built databases**
+
+The SnpEff project generates large numbers of pre-built databases. These are available at @snpeff_database_url@ and can be downloaded. Follow these steps:
+
+  #. Use **SnpEff databases** tool to generate a list of existing databases. Note the name of the database you need.
+  #. Use **SnpEff download** tool to download the database.
+  #. Finally, use **SnpEff eff** by choosing the downloaded database from the history using *Downloaded snpEff database in your history* option of the **Genome source** parameter.
+
+Alternatively, you can specify the name of the database directly in **SnpEff eff** using the *Download on demand* option (again, **Genome source** parameter). In this case snpEff will download the database before performing annotation. 
+
+**Create your own database**
+
+In cases when you are dealing with bacterial or viral (or, frankly, any other) genomes it may be easier to create the database yourself. For this you need to:
+
+ #. Download the GenBank record corresponding to your genome of interest from NCBI.
+ #. Use **SnpEff build** to create the database. 
+ #. Use the database in **SnpEff eff** (using *Custom* option for **Genome source** parameter).
+
+Creating a custom database has one benefit. The **SnpEff build** tool normally produces two outputs: (1) a SnpEff database and (2) a FASTA file containing sequences from the GenBank file. If you are performing your experiment from the beginning by mapping reads against a genome and finding variants before annotating them with SnpEff, you can use **this FASTA file** as a reference to map your reads against. This will guarantee that you will not have any issues related to reference sequence naming -- the most common source of SnpEff errors.
+
+</token>
+
+
+
   <xml name="citations">
       <citations>
         <citation type="doi">10.4161/fly.19695</citation>
         <yield />
       </citations>
   </xml>
-</macros>
+</macros>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input.gbk	Tue Mar 27 09:44:18 2018 -0400
@@ -0,0 +1,79 @@
+LOCUS       LC129268                2808 bp    DNA     linear   SYN 19-MAR-2016
+DEFINITION  Synthetic construct DNA, plasmid vector pUC18 including artificial
+            sequence.
+ACCESSION   LC129268
+VERSION     LC129268.1
+KEYWORDS    .
+SOURCE      synthetic construct
+  ORGANISM  synthetic construct
+            other sequences; artificial sequences.
+REFERENCE   1
+  AUTHORS   Takahashi,M., Kita,Y., Mizuno,A. and Goto-Yamamoto,N.
+  TITLE     Evaluation of method bias in bacterial community analysis
+  JOURNAL   Unpublished
+REFERENCE   2  (bases 1 to 2808)
+  AUTHORS   Takahashi,M. and Goto-Yamamoto,N.
+  TITLE     Direct Submission
+  JOURNAL   Submitted (03-MAR-2016) Contact:Masayuki Takahashi National
+            Research Institute of Brewing, Technology Development Research
+            Division; 3-7-1 Kagamiyama, Higashi-hiroshima, Hiroshima 739-0046,
+            Japan URL :http://www.nrib.go.jp/index.html
+FEATURES             Location/Qualifiers
+     source          1..2808
+                     /organism="synthetic construct"
+                     /mol_type="other DNA"
+                     /db_xref="taxon:32630"
+                     /note="plasmid vector pUC18 including artificial sequence"
+     misc_feature    439..560
+                     /note="internal standard DNA for quantification of
+                     microbial rDNA using quantitative PCR"
+ORIGIN      
+        1 tcgcgcgttt cggtgatgac ggtgaaaacc tctgacacat gcagctcccg gagacggtca
+       61 cagcttgtct gtaagcggat gccgggagca gacaagcccg tcagggcgcg tcagcgggtg
+      121 ttggcgggtg tcggggctgg cttaactatg cggcatcaga gcagattgta ctgagagtgc
+      181 accatatgcg gtgtgaaata ccgcacagat gcgtaaggag aaaataccgc atcaggcgcc
+      241 attcgccatt caggctgcgc aactgttggg aagggcgatc ggtgcgggcc tcttcgctat
+      301 tacgccagct ggcgaaaggg ggatgtgctg caaggcgatt aagttgggta acgccagggt
+      361 tttcccagtc acgacgttgt aaaacgacgg ccagtgccaa gcttgcatgc ctgcaggtcg
+      421 actctagagg atccccggaa ctaatacgac tcactatagg gtccgatctt ccgaggtctc
+      481 atatcgatcg gtagggcatc taatggcttc ggagttcaag ggctatattc gccatgtcag
+      541 atttgtatgc caaaggccgg gtaccgagct cgaattcgta atcatggtca tagctgtttc
+      601 ctgtgtgaaa ttgttatccg ctcacaattc cacacaacat acgagccgga agcataaagt
+      661 gtaaagcctg gggtgcctaa tgagtgagct aactcacatt aattgcgttg cgctcactgc
+      721 ccgctttcca gtcgggaaac ctgtcgtgcc agctgcatta atgaatcggc caacgcgcgg
+      781 ggagaggcgg tttgcgtatt gggcgctctt ccgcttcctc gctcactgac tcgctgcgct
+      841 cggtcgttcg gctgcggcga gcggtatcag ctcactcaaa ggcggtaata cggttatcca
+      901 cagaatcagg ggataacgca ggaaagaaca tgtgagcaaa aggccagcaa aaggccagga
+      961 accgtaaaaa ggccgcgttg ctggcgtttt tccataggct ccgcccccct gacgagcatc
+     1021 acaaaaatcg acgctcaagt cagaggtggc gaaacccgac aggactataa agataccagg
+     1081 cgtttccccc tggaagctcc ctcgtgcgct ctcctgttcc gaccctgccg cttaccggat
+     1141 acctgtccgc ctttctccct tcgggaagcg tggcgctttc tcatagctca cgctgtaggt
+     1201 atctcagttc ggtgtaggtc gttcgctcca agctgggctg tgtgcacgaa ccccccgttc
+     1261 agcccgaccg ctgcgcctta tccggtaact atcgtcttga gtccaacccg gtaagacacg
+     1321 acttatcgcc actggcagca gccactggta acaggattag cagagcgagg tatgtaggcg
+     1381 gtgctacaga gttcttgaag tggtggccta actacggcta cactagaagg acagtatttg
+     1441 gtatctgcgc tctgctgaag ccagttacct tcggaaaaag agttggtagc tcttgatccg
+     1501 gcaaacaaac caccgctggt agcggtggtt tttttgtttg caagcagcag attacgcgca
+     1561 gaaaaaaagg atctcaagaa gatcctttga tcttttctac ggggtctgac gctcagtgga
+     1621 acgaaaactc acgttaaggg attttggtca tgagattatc aaaaaggatc ttcacctaga
+     1681 tccttttaaa ttaaaaatga agttttaaat caatctaaag tatatatgag taaacttggt
+     1741 ctgacagtta ccaatgctta atcagtgagg cacctatctc agcgatctgt ctatttcgtt
+     1801 catccatagt tgcctgactc cccgtcgtgt agataactac gatacgggag ggcttaccat
+     1861 ctggccccag tgctgcaatg ataccgcgag acccacgctc accggctcca gatttatcag
+     1921 caataaacca gccagccgga agggccgagc gcagaagtgg tcctgcaact ttatccgcct
+     1981 ccatccagtc tattaattgt tgccgggaag ctagagtaag tagttcgcca gttaatagtt
+     2041 tgcgcaacgt tgttgccatt gctacaggca tcgtggtgtc acgctcgtcg tttggtatgg
+     2101 cttcattcag ctccggttcc caacgatcaa ggcgagttac atgatccccc atgttgtgca
+     2161 aaaaagcggt tagctccttc ggtcctccga tcgttgtcag aagtaagttg gccgcagtgt
+     2221 tatcactcat ggttatggca gcactgcata attctcttac tgtcatgcca tccgtaagat
+     2281 gcttttctgt gactggtgag tactcaacca agtcattctg agaatagtgt atgcggcgac
+     2341 cgagttgctc ttgcccggcg tcaatacggg ataataccgc gccacatagc agaactttaa
+     2401 aagtgctcat cattggaaaa cgttcttcgg ggcgaaaact ctcaaggatc ttaccgctgt
+     2461 tgagatccag ttcgatgtaa cccactcgtg cacccaactg atcttcagca tcttttactt
+     2521 tcaccagcgt ttctgggtga gcaaaaacag gaaggcaaaa tgccgcaaaa aagggaataa
+     2581 gggcgacacg gaaatgttga atactcatac tcttcctttt tcaatattat tgaagcattt
+     2641 atcagggtta ttgtctcatg agcggataca tatttgaatg tatttagaaa aataaacaaa
+     2701 taggggttcc gcgcacattt ccccgaaaag tgccacctga cgtctaagaa accattatta
+     2761 tcatgacatt aacctataaa aataggcgta tcacgaggcc ctttcgtc
+//
+
Binary file test-data/input.gbk.gz has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_nover.fna	Tue Mar 27 09:44:18 2018 -0400
@@ -0,0 +1,2 @@
+>LC129268
+TCGCGCGTTTCGGTGATGACGGTGAAAACCTCTGACACATGCAGCTCCCGGAGACGGTCACAGCTTGTCTGTAAGCGGATGCCGGGAGCAGACAAGCCCGTCAGGGCGCGTCAGCGGGTGTTGGCGGGTGTCGGGGCTGGCTTAACTATGCGGCATCAGAGCAGATTGTACTGAGAGTGCACCATATGCGGTGTGAAATACCGCACAGATGCGTAAGGAGAAAATACCGCATCAGGCGCCATTCGCCATTCAGGCTGCGCAACTGTTGGGAAGGGCGATCGGTGCGGGCCTCTTCGCTATTACGCCAGCTGGCGAAAGGGGGATGTGCTGCAAGGCGATTAAGTTGGGTAACGCCAGGGTTTTCCCAGTCACGACGTTGTAAAACGACGGCCAGTGCCAAGCTTGCATGCCTGCAGGTCGACTCTAGAGGATCCCCGGAACTAATACGACTCACTATAGGGTCCGATCTTCCGAGGTCTCATATCGATCGGTAGGGCATCTAATGGCTTCGGAGTTCAAGGGCTATATTCGCCATGTCAGATTTGTATGCCAAAGGCCGGGTACCGAGCTCGAATTCGTAATCATGGTCATAGCTGTTTCCTGTGTGAAATTGTTATCCGCTCACAATTCCACACAACATACGAGCCGGAAGCATAAAGTGTAAAGCCTGGGGTGCCTAATGAGTGAGCTAACTCACATTAATTGCGTTGCGCTCACTGCCCGCTTTCCAGTCGGGAAACCTGTCGTGCCAGCTGCATTAATGAATCGGCCAACGCGCGGGGAGAGGCGGTTTGCGTATTGGGCGCTCTTCCGCTTCCTCGCTCACTGACTCGCTGCGCTCGGTCGTTCGGCTGCGGCGAGCGGTATCAGCTCACTCAAAGGCGGTAATACGGTTATCCACAGAATCAGGGGATAACGCAGGAAAGAACATGTGAGCAAAAGGCCAGCAAAAGGCCAGGAACCGTAAAAAGGCCGCGTTGCTGGCGTTTTTCCATAGGCTCCGCCCCCCTGACGAGCATCACAAAAATCGACGCTCAAGTCAGAGGTGGCGAAACCCGACAGGACTATAAAGATACCAGGCGTTTCCCCCTGGAAGCTCCCTCGTGCGCTCTCCTGTTCCGACCCTGCCGCTTACCGGATACCTGTCCGCCTTTCTCCCTTCGGGAAGCGTGGCGCTTTCTCATAGCTCACGCTGTAGGTATCTCAGTTCGGTGTAGGTCGTTCGCTCCAAGCTGGGCTGTGTGCACGAACCCCCCGTTCAGCCCGACCGCTGCGCCTTATCCGGTAACTATCGTCTTGAGTCCAACCCGGTAAGACACGACTTATCGCCACTGGCAGCAGCCACTGGTAACAGGATTAGCAGAGCGAGGTATGTAGGCGGTGCTACAGAGTTCTTGAAGTGGTGGCCTAACTACGGCTACACTAGAAGGACAGTATTTGGTATCTGCGCTCTGCTGAAGCCAGTTACCTTCGGAAAAAGAGTTGGTAGCTCTTGATCCGGCAAACAAACCACCGCTGGTAGCGGTGGTTTTTTTGTTTGCAAGCAGCAGATTACGCGCAGAAAAAAAGGATCTCAAGAAGATCCTTTGATCTTTTCTACGGGGTCTGACGCTCAGTGGAACGAAAACTCACGTTAAGGGATTTTGGTCATGAGATTATCAAAAAGGATCTTCACCTAGATCCTTTTAAATTAAAAATGAAGTTTTAAATCAATCTAAAGTATATATGAGTAAACTTGGTCTGACAGTTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTATTTCGTTCATCCATAGTTGCCTGACTCCCCGTCGTGTAGATAACTACGATACGGGAGGGCTTACCATCTGGCCCCAGTGCTGCAATGATACCGCGAGACCCACGCTCACCGGCTCCAGATTTATCAGCAATAAACCAGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCCATCCAGTCTATTAATTG
TTGCCGGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTGTTGCCATTGCTACAGGCATCGTGGTGTCACGCTCGTCGTTTGGTATGGCTTCATTCAGCTCCGGTTCCCAACGATCAAGGCGAGTTACATGATCCCCCATGTTGTGCAAAAAAGCGGTTAGCTCCTTCGGTCCTCCGATCGTTGTCAGAAGTAAGTTGGCCGCAGTGTTATCACTCATGGTTATGGCAGCACTGCATAATTCTCTTACTGTCATGCCATCCGTAAGATGCTTTTCTGTGACTGGTGAGTACTCAACCAAGTCATTCTGAGAATAGTGTATGCGGCGACCGAGTTGCTCTTGCCCGGCGTCAATACGGGATAATACCGCGCCACATAGCAGAACTTTAAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACATTTCCCCGAAAAGTGCCACCTGACGTCTAAGAAACCATTATTATCATGACATTAACCTATAAAAATAGGCGTATCACGAGGCCCTTTCGTC
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_ver.fna	Tue Mar 27 09:44:18 2018 -0400
@@ -0,0 +1,2 @@
+>LC129268.1
+TCGCGCGTTTCGGTGATGACGGTGAAAACCTCTGACACATGCAGCTCCCGGAGACGGTCACAGCTTGTCTGTAAGCGGATGCCGGGAGCAGACAAGCCCGTCAGGGCGCGTCAGCGGGTGTTGGCGGGTGTCGGGGCTGGCTTAACTATGCGGCATCAGAGCAGATTGTACTGAGAGTGCACCATATGCGGTGTGAAATACCGCACAGATGCGTAAGGAGAAAATACCGCATCAGGCGCCATTCGCCATTCAGGCTGCGCAACTGTTGGGAAGGGCGATCGGTGCGGGCCTCTTCGCTATTACGCCAGCTGGCGAAAGGGGGATGTGCTGCAAGGCGATTAAGTTGGGTAACGCCAGGGTTTTCCCAGTCACGACGTTGTAAAACGACGGCCAGTGCCAAGCTTGCATGCCTGCAGGTCGACTCTAGAGGATCCCCGGAACTAATACGACTCACTATAGGGTCCGATCTTCCGAGGTCTCATATCGATCGGTAGGGCATCTAATGGCTTCGGAGTTCAAGGGCTATATTCGCCATGTCAGATTTGTATGCCAAAGGCCGGGTACCGAGCTCGAATTCGTAATCATGGTCATAGCTGTTTCCTGTGTGAAATTGTTATCCGCTCACAATTCCACACAACATACGAGCCGGAAGCATAAAGTGTAAAGCCTGGGGTGCCTAATGAGTGAGCTAACTCACATTAATTGCGTTGCGCTCACTGCCCGCTTTCCAGTCGGGAAACCTGTCGTGCCAGCTGCATTAATGAATCGGCCAACGCGCGGGGAGAGGCGGTTTGCGTATTGGGCGCTCTTCCGCTTCCTCGCTCACTGACTCGCTGCGCTCGGTCGTTCGGCTGCGGCGAGCGGTATCAGCTCACTCAAAGGCGGTAATACGGTTATCCACAGAATCAGGGGATAACGCAGGAAAGAACATGTGAGCAAAAGGCCAGCAAAAGGCCAGGAACCGTAAAAAGGCCGCGTTGCTGGCGTTTTTCCATAGGCTCCGCCCCCCTGACGAGCATCACAAAAATCGACGCTCAAGTCAGAGGTGGCGAAACCCGACAGGACTATAAAGATACCAGGCGTTTCCCCCTGGAAGCTCCCTCGTGCGCTCTCCTGTTCCGACCCTGCCGCTTACCGGATACCTGTCCGCCTTTCTCCCTTCGGGAAGCGTGGCGCTTTCTCATAGCTCACGCTGTAGGTATCTCAGTTCGGTGTAGGTCGTTCGCTCCAAGCTGGGCTGTGTGCACGAACCCCCCGTTCAGCCCGACCGCTGCGCCTTATCCGGTAACTATCGTCTTGAGTCCAACCCGGTAAGACACGACTTATCGCCACTGGCAGCAGCCACTGGTAACAGGATTAGCAGAGCGAGGTATGTAGGCGGTGCTACAGAGTTCTTGAAGTGGTGGCCTAACTACGGCTACACTAGAAGGACAGTATTTGGTATCTGCGCTCTGCTGAAGCCAGTTACCTTCGGAAAAAGAGTTGGTAGCTCTTGATCCGGCAAACAAACCACCGCTGGTAGCGGTGGTTTTTTTGTTTGCAAGCAGCAGATTACGCGCAGAAAAAAAGGATCTCAAGAAGATCCTTTGATCTTTTCTACGGGGTCTGACGCTCAGTGGAACGAAAACTCACGTTAAGGGATTTTGGTCATGAGATTATCAAAAAGGATCTTCACCTAGATCCTTTTAAATTAAAAATGAAGTTTTAAATCAATCTAAAGTATATATGAGTAAACTTGGTCTGACAGTTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTATTTCGTTCATCCATAGTTGCCTGACTCCCCGTCGTGTAGATAACTACGATACGGGAGGGCTTACCATCTGGCCCCAGTGCTGCAATGATACCGCGAGACCCACGCTCACCGGCTCCAGATTTATCAGCAATAAACCAGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCCATCCAGTCTATTAATTG
TTGCCGGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTGTTGCCATTGCTACAGGCATCGTGGTGTCACGCTCGTCGTTTGGTATGGCTTCATTCAGCTCCGGTTCCCAACGATCAAGGCGAGTTACATGATCCCCCATGTTGTGCAAAAAAGCGGTTAGCTCCTTCGGTCCTCCGATCGTTGTCAGAAGTAAGTTGGCCGCAGTGTTATCACTCATGGTTATGGCAGCACTGCATAATTCTCTTACTGTCATGCCATCCGTAAGATGCTTTTCTGTGACTGGTGAGTACTCAACCAAGTCATTCTGAGAATAGTGTATGCGGCGACCGAGTTGCTCTTGCCCGGCGTCAATACGGGATAATACCGCGCCACATAGCAGAACTTTAAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACATTTCCCCGAAAAGTGCCACCTGACGTCTAAGAAACCATTATTATCATGACATTAACCTATAAAAATAGGCGTATCACGAGGCCCTTTCGTC
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/pBR322.fna	Tue Mar 27 09:44:18 2018 -0400
@@ -0,0 +1,2 @@
+>J01749
+TTCTCATGTTTGACAGCTTATCATCGATAAGCTTTAATGCGGTAGTTTATCACAGTTAAATTGCTAACGCAGTCAGGCACCGTGTATGAAATCTAACAATGCGCTCATCGTCATCCTCGGCACCGTCACCCTGGATGCTGTAGGCATAGGCTTGGTTATGCCGGTACTGCCGGGCCTCTTGCGGGATATCGTCCATTCCGACAGCATCGCCAGTCACTATGGCGTGCTGCTAGCGCTATATGCGTTGATGCAATTTCTATGCGCACCCGTTCTCGGAGCACTGTCCGACCGCTTTGGCCGCCGCCCAGTCCTGCTCGCTTCGCTACTTGGAGCCACTATCGACTACGCGATCATGGCGACCACACCCGTCCTGTGGATCCTCTACGCCGGACGCATCGTGGCCGGCATCACCGGCGCCACAGGTGCGGTTGCTGGCGCCTATATCGCCGACATCACCGATGGGGAAGATCGGGCTCGCCACTTCGGGCTCATGAGCGCTTGTTTCGGCGTGGGTATGGTGGCAGGCCCCGTGGCCGGGGGACTGTTGGGCGCCATCTCCTTGCATGCACCATTCCTTGCGGCGGCGGTGCTCAACGGCCTCAACCTACTACTGGGCTGCTTCCTAATGCAGGAGTCGCATAAGGGAGAGCGTCGACCGATGCCCTTGAGAGCCTTCAACCCAGTCAGCTCCTTCCGGTGGGCGCGGGGCATGACTATCGTCGCCGCACTTATGACTGTCTTCTTTATCATGCAACTCGTAGGACAGGTGCCGGCAGCGCTCTGGGTCATTTTCGGCGAGGACCGCTTTCGCTGGAGCGCGACGATGATCGGCCTGTCGCTTGCGGTATTCGGAATCTTGCACGCCCTCGCTCAAGCCTTCGTCACTGGTCCCGCCACCAAACGTTTCGGCGAGAAGCAGGCCATTATCGCCGGCATGGCGGCCGACGCGCTGGGCTACGTCTTGCTGGCGTTCGCGACGCGAGGCTGGATGGCCTTCCCCATTATGATTCTTCTCGCTTCCGGCGGCATCGGGATGCCCGCGTTGCAGGCCATGCTGTCCAGGCAGGTAGATGACGACCATCAGGGACAGCTTCAAGGATCGCTCGCGGCTCTTACCAGCCTAACTTCGATCACTGGACCGCTGATCGTCACGGCGATTTATGCCGCCTCGGCGAGCACATGGAACGGGTTGGCATGGATTGTAGGCGCCGCCCTATACCTTGTCTGCCTCCCCGCGTTGCGTCGCGGTGCATGGAGCCGGGCCACCTCGACCTGAATGGAAGCCGGCGGCACCTCGCTAACGGATTCACCACTCCAAGAATTGGAGCCAATCAATTCTTGCGGAGAACTGTGAATGCGCAAACCAACCCTTGGCAGAACATATCCATCGCGTCCGCCATCTCCAGCAGCCGCACGCGGCGCATCTCGGGCAGCGTTGGGTCCTGGCCACGGGTGCGCATGATCGTGCTCCTGTCGTTGAGGACCCGGCTAGGCTGGCGGGGTTGCCTTACTGGTTAGCAGAATGAATCACCGATACGCGAGCGAACGTGAAGCGACTGCTGCTGCAAAACGTCTGCGACCTGAGCAACAACATGAATGGTCTTCGGTTTCCGTGTTTCGTAAAGTCTGGAAACGCGGAAGTCAGCGCCCTGCACCATTATGTTCCGGATCTGCATCGCAGGATGCTGCTGGCTACCCTGTGGAACACCTACATCTGTATTAACGAAGCGCTGGCATTGACCCTGAGTGATTTTTCTCTGGTCCCGCCGCATCCATACCGCCAGTTGTTTACCCTCACAACGTTCCAGTAACCGGGCATGTTCATCATCAGTAACCCGTATCGTGAGCATCCTCTCTCGTTTCATCGGTATCATTACCCCCATGAACAGAAATCCCCCTTACACGGAGGCATCAGTGACCAAACAGGAAAAAACCGCCCTTAACATGGCCCGCTTTATCAGAAGCCAGACATTAACGCTTCTGGAGAAACTCAACGAGC
TGGACGCGGATGAACAGGCAGACATCTGTGAATCGCTTCACGACCACGCTGATGAGCTTTACCGCAGCTGCCTCGCGCGTTTCGGTGATGACGGTGAAAACCTCTGACACATGCAGCTCCCGGAGACGGTCACAGCTTGTCTGTAAGCGGATGCCGGGAGCAGACAAGCCCGTCAGGGCGCGTCAGCGGGTGTTGGCGGGTGTCGGGGCGCAGCCATGACCCAGTCACGTAGCGATAGCGGAGTGTATACTGGCTTAACTATGCGGCATCAGAGCAGATTGTACTGAGAGTGCACCATATGCGGTGTGAAATACCGCACAGATGCGTAAGGAGAAAATACCGCATCAGGCGCTCTTCCGCTTCCTCGCTCACTGACTCGCTGCGCTCGGTCGTTCGGCTGCGGCGAGCGGTATCAGCTCACTCAAAGGCGGTAATACGGTTATCCACAGAATCAGGGGATAACGCAGGAAAGAACATGTGAGCAAAAGGCCAGCAAAAGGCCAGGAACCGTAAAAAGGCCGCGTTGCTGGCGTTTTTCCATAGGCTCCGCCCCCCTGACGAGCATCACAAAAATCGACGCTCAAGTCAGAGGTGGCGAAACCCGACAGGACTATAAAGATACCAGGCGTTTCCCCCTGGAAGCTCCCTCGTGCGCTCTCCTGTTCCGACCCTGCCGCTTACCGGATACCTGTCCGCCTTTCTCCCTTCGGGAAGCGTGGCGCTTTCTCATAGCTCACGCTGTAGGTATCTCAGTTCGGTGTAGGTCGTTCGCTCCAAGCTGGGCTGTGTGCACGAACCCCCCGTTCAGCCCGACCGCTGCGCCTTATCCGGTAACTATCGTCTTGAGTCCAACCCGGTAAGACACGACTTATCGCCACTGGCAGCAGCCACTGGTAACAGGATTAGCAGAGCGAGGTATGTAGGCGGTGCTACAGAGTTCTTGAAGTGGTGGCCTAACTACGGCTACACTAGAAGGACAGTATTTGGTATCTGCGCTCTGCTGAAGCCAGTTACCTTCGGAAAAAGAGTTGGTAGCTCTTGATCCGGCAAACAAACCACCGCTGGTAGCGGTGGTTTTTTTGTTTGCAAGCAGCAGATTACGCGCAGAAAAAAAGGATCTCAAGAAGATCCTTTGATCTTTTCTACGGGGTCTGACGCTCAGTGGAACGAAAACTCACGTTAAGGGATTTTGGTCATGAGATTATCAAAAAGGATCTTCACCTAGATCCTTTTAAATTAAAAATGAAGTTTTAAATCAATCTAAAGTATATATGAGTAAACTTGGTCTGACAGTTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTATTTCGTTCATCCATAGTTGCCTGACTCCCCGTCGTGTAGATAACTACGATACGGGAGGGCTTACCATCTGGCCCCAGTGCTGCAATGATACCGCGAGACCCACGCTCACCGGCTCCAGATTTATCAGCAATAAACCAGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCCATCCAGTCTATTAATTGTTGCCGGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTGTTGCCATTGCTGCAGGCATCGTGGTGTCACGCTCGTCGTTTGGTATGGCTTCATTCAGCTCCGGTTCCCAACGATCAAGGCGAGTTACATGATCCCCCATGTTGTGCAAAAAAGCGGTTAGCTCCTTCGGTCCTCCGATCGTTGTCAGAAGTAAGTTGGCCGCAGTGTTATCACTCATGGTTATGGCAGCACTGCATAATTCTCTTACTGTCATGCCATCCGTAAGATGCTTTTCTGTGACTGGTGAGTACTCAACCAAGTCATTCTGAGAATAGTGTATGCGGCGACCGAGTTGCTCTTGCCCGGCGTCAACACGGGATAATACCGCGCCACATAGCAGAACTTTAAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTAC
CGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACATTTCCCCGAAAAGTGCCACCTGACGTCTAAGAAACCATTATTATCATGACATTAACCTATAAAAATAGGCGTATCACGAGGCCCTTTCGTCTTCAAGAA
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/pBR322.gbk	Tue Mar 27 09:44:18 2018 -0400
@@ -0,0 +1,474 @@
+LOCUS       SYNPBR322               4361 bp    DNA     circular SYN 30-SEP-2008
+DEFINITION  Cloning vector pBR322, complete sequence.
+ACCESSION   J01749 K00005 L08654 M10282 M10283 M10286 M10356 M10784 M10785
+            M10786 M33694 V01119
+VERSION     J01749.1
+KEYWORDS    ampicillin resistance; beta-lactamase; cloning vector; drug
+            resistance protein; origin of replication; plasmid; tetracycline
+            resistance.
+SOURCE      Cloning vector pBR322
+  ORGANISM  Cloning vector pBR322
+            other sequences; artificial sequences; vectors.
+REFERENCE   1  (bases 1 to 3; 3259 to 4361)
+  AUTHORS   Sutcliffe,J.G.
+  TITLE     Nucleotide sequence of the ampicillin resistance gene of
+            Escherichia coli plasmid pBR322
+  JOURNAL   Proc. Natl. Acad. Sci. U.S.A. 75 (8), 3737-3741 (1978)
+   PUBMED   358200
+REFERENCE   2  (bases 1 to 4361)
+  AUTHORS   Sutcliffe,J.G.
+  TITLE     Complete nucleotide sequence of the Escherichia coli plasmid pBR322
+  JOURNAL   Cold Spring Harb. Symp. Quant. Biol. 43 (Pt 1), 77-90 (1979)
+   PUBMED   383387
+REFERENCE   3  (bases 1500 to 2300)
+  AUTHORS   Reed,R.R., Young,R.A., Steitz,J.A., Grindley,N.D. and Guyer,M.S.
+  TITLE     Transposition of the Escherichia coli insertion element gamma
+            generates a five-base-pair repeat
+  JOURNAL   Proc. Natl. Acad. Sci. U.S.A. 76 (10), 4882-4886 (1979)
+   PUBMED   388421
+REFERENCE   4  (bases 2207 to 2265)
+  AUTHORS   Covarrubias,L., Cervantes,L., Covarrubias,A., Soberon,X.,
+            Vichido,I., Blanco,A., Kupersztoch-Portnoy,Y.M. and Bolivar,F.
+  TITLE     Construction and characterization of new cloning vehicles. V.
+            Mobilization and coding properties of pBR322 and several deletion
+            derivatives including pBR327 and pBR328
+  JOURNAL   Gene 13 (1), 25-35 (1981)
+   PUBMED   6263753
+REFERENCE   5  (bases 2000 to 2500)
+  AUTHORS   Marians,K.J., Soeller,W. and Zipursky,S.L.
+  TITLE     Maximal limits of the Escherichia coli replication factor Y
+            effector site sequences in pBR322 DNA
+  JOURNAL   J. Biol. Chem. 257 (10), 5656-5662 (1982)
+   PUBMED   6279609
+REFERENCE   6  (bases 1 to 80; 4151 to 4229; 4349 to 4361)
+  AUTHORS   Brosius,J., Cate,R.L. and Perlmutter,A.P.
+  TITLE     Precise location of two promoters for the beta-lactamase gene of
+            pBR322. S1 mapping of ribonucleic acid isolated from Escherichia
+            coli or synthesized in vitro
+  JOURNAL   J. Biol. Chem. 257 (15), 9205-9210 (1982)
+   PUBMED   6178738
+REFERENCE   7  (bases 4241 to 4343)
+  AUTHORS   Van Dyke,M.W., Hertzberg,R.P. and Dervan,P.B.
+  TITLE     Map of distamycin, netropsin, and actinomycin binding sites on
+            heterogeneous DNA: DNA cleavage-inhibition patterns with
+            methidiumpropyl-EDTA.Fe(II)
+  JOURNAL   Proc. Natl. Acad. Sci. U.S.A. 79 (18), 5470-5474 (1982)
+   PUBMED   6291045
+REFERENCE   8  (bases 584 to 709)
+  AUTHORS   Peden,K.W. and Nathans,D.
+  TITLE     Local mutagenesis within deletion loops of DNA heteroduplexes
+  JOURNAL   Proc. Natl. Acad. Sci. U.S.A. 79 (23), 7214-7217 (1982)
+   PUBMED   6760191
+REFERENCE   9  (bases 373 to 649)
+  AUTHORS   Peden,K.W.
+  TITLE     Revised sequence of the tetracycline-resistance gene of pBR322
+  JOURNAL   Gene 22 (2-3), 277-280 (1983)
+   PUBMED   6307828
+REFERENCE   10 (bases 132 to 181)
+  AUTHORS   Watabe,H., Iino,T., Kaneko,T., Shibata,T. and Ando,T.
+  TITLE     A new class of site-specific endodeoxyribonucleases. Endo.Sce I
+            isolated from a eukaryote, Saccharomyces cerevisiae
+  JOURNAL   J. Biol. Chem. 258 (8), 4663-4665 (1983)
+   PUBMED   6300094
+REFERENCE   11 (bases 368 to 581)
+  AUTHORS   Livneh,Z.
+  TITLE     Directed mutagenesis method for analysis of mutagen specificity:
+            application to ultraviolet-induced mutagenesis
+  JOURNAL   Proc. Natl. Acad. Sci. U.S.A. 80 (1), 237-241 (1983)
+   PUBMED   6337373
+REFERENCE   12 (bases 2627 to 2682; 2781 to 2828)
+  AUTHORS   Mascharak,P.K., Sugiura,Y., Kuwahara,J., Suzuki,T. and Lippard,S.J.
+  TITLE     Alteration and activation of sequence-specific cleavage of DNA by
+            bleomycin in the presence of the antitumor drug
+            cis-diamminedichloroplatinum(II)
+  JOURNAL   Proc. Natl. Acad. Sci. U.S.A. 80 (22), 6795-6798 (1983)
+   PUBMED   6196777
+REFERENCE   13 (bases 4276 to 4336)
+  AUTHORS   Schultz,P.G. and Dervan,P.B.
+  TITLE     Sequence-specific double-strand cleavage of DNA by
+            penta-N-methylpyrrolecarboxamide-EDTA X Fe(II)
+  JOURNAL   Proc. Natl. Acad. Sci. U.S.A. 80 (22), 6834-6837 (1983)
+   PUBMED   6417654
+REFERENCE   14 (bases 518 to 528)
+  AUTHORS   Sutcliffe,J.G.
+  JOURNAL   Unpublished
+REFERENCE   15 (bases 2395 to 2495)
+  AUTHORS   Fuller,R.S., Funnell,B.E. and Kornberg,A.
+  TITLE     The dnaA protein complex with the E. coli chromosomal replication
+            origin (oriC) and other DNA sites
+  JOURNAL   Cell 38 (3), 889-900 (1984)
+   PUBMED   6091903
+REFERENCE   16 (bases 2729 to 2731)
+  AUTHORS   Lathe,R., Kieny,M.P., Skory,S. and Lecocq,J.P.
+  TITLE     Linker tailing: unphosphorylated linker oligonucleotides for
+            joining DNA termini
+  JOURNAL   DNA 3 (2), 173-182 (1984)
+   PUBMED   6327214
+REFERENCE   17 (bases 2729 to 2730)
+  AUTHORS   Heusterspreute,M. and Davison,J.
+  TITLE     Restriction site bank vectors. II. DNA sequence analysis of plasmid
+            pJRD158
+  JOURNAL   DNA 3 (3), 259-268 (1984)
+   PUBMED   6086259
+REFERENCE   18 (bases 2113 to 2186; 2348 to 2415)
+  AUTHORS   Abarzua,P., Soeller,W. and Marians,K.J.
+  TITLE     Mutational analysis of primosome assembly sites. I. Distinct
+            classes of mutants in the pBR322 Escherichia coli factor Y DNA
+            effector sequences
+  JOURNAL   J. Biol. Chem. 259 (22), 14286-14292 (1984)
+   PUBMED   6209275
+REFERENCE   19 (bases 2348 to 2415)
+  AUTHORS   Soeller,W., Abarzua,P. and Marians,K.J.
+  TITLE     Mutational analysis of primosome assembly sites. II. Role of
+            secondary structure in the formation of active sites
+  JOURNAL   J. Biol. Chem. 259 (22), 14293-14300 (1984)
+   PUBMED   6150042
+REFERENCE   20 (bases 1 to 4361)
+  AUTHORS   Van Dyke,M.M. and Dervan,P.B.
+  TITLE     Echinomycin binding sites on DNA
+  JOURNAL   Science 225 (4667), 1122-1127 (1984)
+   PUBMED   6089341
+REFERENCE   21 (sites)
+  AUTHORS   Pouwels,P.H., Enger-Valk,B.E. and Brammar,W.J.
+  JOURNAL   (in) CLONING VECTORS. Elsevier Scientific Publishing, Amsterdam
+            (1985)
+  REMARK    Vector I-A-iv-1
+REFERENCE   22 (bases 1 to 4361)
+  AUTHORS   Watson,N.
+  TITLE     A new revision of the sequence of plasmid pBR322
+  JOURNAL   Gene 70 (2), 399-403 (1988)
+   PUBMED   3063608
+REFERENCE   23 (sites)
+  AUTHORS   Gilbert,W.
+  TITLE     Obtained from VecBase 3.0
+  JOURNAL   Unpublished
+COMMENT     On or before Apr 4, 2002 this sequence version replaced L08654.1,
+            gi:58257.
+            The circular sequence is numbered such that 0 is the middle of the
+            unique EcoRI site and the count increases first through the tet
+            genes, the pMB1 material, and finally through the Tn3 region.
+            Plasmid pBR322 contains ampicillin and tetracycline resistance
+            genes.  The ampicillin resistance gene (amp-r) is a penicillin
+            beta-lactamase. Promoters P1 and P3 are for the beta-lactamase
+            gene. P3 is the natural promoter, and P1 is artificially created by
+            the ligation of two different DNA fragments to create pBR322. P2 is
+            in the same region as P1, but it is on the opposite strand and
+            initiates transcription in the direction of the tetracycline
+            resistance gene.
+            Mutational studies in the primosome assembly sites indicate four
+            types of mutations:  Class I having no effect on the activities
+            elicited by the DNA site and the bases involved are probably
+            spacers; Class II requiring higher Mg-2+ concentrations than the
+            wild-type to be fully activated as factor Y ATPase effectors; Class
+            III co-inactivating both the ATPase effector and DNA replication
+            template activity of the site, indicating that they probably
+            represent essential contact points between factor Y and the DNA;
+            Class IV having a replication template activity intermediate that
+            of class III and class II mutant DNAs.
+            Specific sites within or near the origins of replication are
+            recognized by dnaA protein.  Without dnaA binding to the origin of
+            replication chromosomal replication is not possible (15). pBR322
+            DNA contains two separate regions on opposite strands and close to
+            the origin of replication which, when in single-stranded form, can
+            act as effectors for the ATPase activity of E.coli replication
+            factor Y (5).  Small fragments of DNA containing these sites when
+            cloned in an f1 phage vector act as origins of DNA replication
+            allowing the formation of complementary double-stranded DNA in
+            rifampicin-resistant, dna(B,G,C)-dependent fashion in vitro (5).
+            The biological activity of echinomycin is thought to be related to
+            the formation of complexes by intercalating with cellular DNA (20).
+            Complete source information:
+            Plasmid pBR322 from E.coli (2),(1),(3),(6),(11),(8),(5),(7),(12),
+            (13),(10),(9),(14),(18),(19),(15),(20),(16); pBR322 DNA in pXf3
+            (4).
+            The following data and their annotation were supplied by Will
+            Gilbert under the auspices of the Curator Program.
+             CROSSREFERENCE
+                           #parent
+                             GenBank(50):pSC101C, GenBank(50):Trn3
+                           #offspring
+                             VecBase(3):pBR325, VecBase(3):pBR327,
+            VecBase(3):pBR328,
+                             VecBase(3):pAT153, VecBase(3):pUC7,
+            VecBase(3):pJRD158,
+                             VecBase(3):PiVX, VecBase(3):PiAN7,
+            VecBase(3):pSP64,
+                             VecBase(3):pSP65, VecBase(3):pGEM1,
+            VecBase(3):pGEM2,
+                             VecBase(3):pGEM3, VecBase(3):pGEM4,
+            VecBase(3):pKK223,
+                             VecBase(3):pLBU3, VecBase(3):pTrS3,
+            VecBase(3):pRSVNeo,
+                             VecBase(3):pSV2Cat, VecBase(3):M13mp9,
+            VecBase(3):pHC79,
+                             VecBase(3):pV34, VecBase(3):pKTH601,
+            VecBase(3):pKTH604,
+                             VecBase(3):pKTH605, VecBase(3):pKTH606,
+            VecBase(3):YEp24,
+                             VecBase(3):YIp5, VecBase(3):YRp17,
+            VecBase(3):pSP18,
+                        VecBase(3):pSP19,
+                             VecBase(3):pSP6T3, VecBase(3):pSP6T719,
+            VecBase(3):pT712,
+                             VecBase(3):pT713, VecBase(3):pT7T318,
+            VecBase(3):pT7T319,
+                             VecBase(3):pT7T3A18, VecBase(3):pT7T3A19,
+            VecBase(3):pEX1,
+                             VecBase(3):pEX2, VecBase(3):pEX3,
+            VecBase(3):pCKSP6,
+                             VecBase(3):pACYC177, VecBase(3):pKO1,
+            VecBase(3):pKO2,
+                        VecBase(3):pKM1,
+                             VecBase(3):pKM2, VecBase(3):pMBL1,
+            VecBase(3):pMBL604,
+                             VecBase(3):pMC1511, VecBase(3):pMC1871,
+            VecBase(3):pAA37X,
+                             VecBase(3):pUR278, VecBase(3):pUR288,
+            VecBase(3):pUR289,
+                             VecBase(3):pUR290, VecBase(3):pUR291,
+            VecBase(3):pUR292,
+                             VecBase(3):pUR222.
+FEATURES             Location/Qualifiers
+     source          1..4361
+                     /organism="Cloning vector pBR322"
+                     /mol_type="other DNA"
+                     /db_xref="taxon:47470"
+                     /tissue_lib="ATCC 31344, ATCC 37017"
+                     /focus
+     source          1..1762
+                     /organism="Plasmid pSC101"
+                     /mol_type="other DNA"
+                     /db_xref="taxon:2625"
+     misc_binding    24..27
+                     /bound_moiety="echinomycin"
+     regulatory      complement(27..33)
+                     /regulatory_class="promoter"
+                     /note="promoter P1 (6)"
+     misc_binding    39..42
+                     /bound_moiety="echinomycin"
+     regulatory      43..49
+                     /regulatory_class="promoter"
+                     /note="promoter P2 (6)"
+     misc_binding    53..56
+                     /bound_moiety="echinomycin"
+     misc_binding    67..70
+                     /bound_moiety="echinomycin"
+     misc_binding    80..83
+                     /bound_moiety="echinomycin"
+     gene            86..1276
+                     /gene="tet"
+     CDS             86..1276
+                     /gene="tet"
+                     /codon_start=1
+                     /transl_table=11
+                     /product="tetracycline resistance protein"
+                     /protein_id="AAB59735.1"
+                     /translation="MKSNNALIVILGTVTLDAVGIGLVMPVLPGLLRDIVHSDSIASH
+                     YGVLLALYALMQFLCAPVLGALSDRFGRRPVLLASLLGATIDYAIMATTPVLWILYAG
+                     RIVAGITGATGAVAGAYIADITDGEDRARHFGLMSACFGVGMVAGPVAGGLLGAISLH
+                     APFLAAAVLNGLNLLLGCFLMQESHKGERRPMPLRAFNPVSSFRWARGMTIVAALMTV
+                     FFIMQLVGQVPAALWVIFGEDRFRWSATMIGLSLAVFGILHALAQAFVTGPATKRFGE
+                     KQAIIAGMAADALGYVLLAFATRGWMAFPIMILLASGGIGMPALQAMLSRQVDDDHQG
+                     QLQGSLAALTSLTSITGPLIVTAIYAASASTWNGLAWIVGAALYLVCLPALRRGAWSR
+                     ATST"
+     misc_feature    complement(141..142)
+                     /note="Endo.Sce I cleavage site coordinated with site at
+                     base 146 (10)"
+     misc_feature    146..147
+                     /gene="tet"
+                     /note="Endo.Sce I cleavage site coordinated with site at
+                     base 142 (10)"
+     misc_binding    411..414
+                     /gene="tet"
+                     /bound_moiety="echinomycin"
+     misc_difference 426
+                     /gene="tet"
+                     /note="conflict"
+                     /citation=[11]
+                     /replace=""
+     misc_binding    469..472
+                     /gene="tet"
+                     /bound_moiety="echinomycin"
+     old_sequence    526..528
+                     /gene="tet"
+                     /citation=[17]
+     repeat_region   complement(1515..1519)
+                     /note="gamma-delta insertion target sequence"
+                     /rpt_type=direct
+     misc_feature    1636..1762
+                     /note="from pSC101 (bp 1860-1986)"
+     repeat_region   complement(1788..1792)
+                     /note="gamma-delta insertion target sequence"
+                     /rpt_type=direct
+     misc_difference 1891..1892
+                     /note="conflict"
+                     /citation=[23]
+                     /replace="att"
+     old_sequence    1892..1893
+                     /citation=[2]
+                     /citation=[22]
+     regulatory      1905..1910
+                     /regulatory_class="ribosome_binding_site"
+     regulatory      1905..1909
+                     /regulatory_class="ribosome_binding_site"
+                     /note="Shine-Dalgarno sequence"
+     misc_difference 1913..1914
+                     /note="conflict"
+                     /citation=[23]
+                     /replace="caa"
+     old_sequence    1914..1915
+                     /citation=[17]
+     CDS             1915..2106
+                     /codon_start=1
+                     /transl_table=11
+                     /product="ROP protein"
+                     /protein_id="AAB59736.1"
+                     /translation="MTKQEKTALNMARFIRSQTLTLLEKLNELDADEQADICESLHDH
+                     ADELYRSCLARFGDDGENL"
+     misc_feature    2011..2167
+                     /note="H-strand Y effector site"
+                     /citation=[5]
+     repeat_region   complement(2245..2249)
+                     /note="gamma-delta insertion target sequence"
+                     /rpt_type=direct
+     misc_feature    complement(2351..2414)
+                     /note="L-strand Y effector site"
+                     /citation=[5]
+     misc_binding    2439..2447
+                     /bound_moiety="dnaA"
+     rep_origin      2535
+     old_sequence    2729..2730
+                     /note="revision according to (17)"
+                     /citation=[2]
+                     /citation=[17]
+                     /replace="at"
+     old_sequence    2729
+                     /citation=[17]
+     old_sequence    2730
+                     /note="revision according to (16)"
+                     /citation=[2]
+                     /citation=[16]
+                     /citation=[17]
+                     /replace="t"
+     mobile_element  3148..4361
+                     /mobile_element_type="transposon:Tn3"
+     repeat_region   3148..3185
+                     /note="corresponds to one of the 38bp repeats found in Tn3
+                     (bp 1-38 and complement (4920-4957))"
+                     /rpt_type=inverted
+     gene            complement(3293..4153)
+                     /gene="bla"
+     CDS             complement(3293..4153)
+                     /gene="bla"
+                     /note="E-286"
+                     /codon_start=1
+                     /transl_table=11
+                     /product="beta-lactamase"
+                     /protein_id="AAB59737.1"
+                     /translation="MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGY
+                     IELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVE
+                     YSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRL
+                     DRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPL
+                     LRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIA
+                     EIGASLIKHW"
+     sig_peptide     complement(4085..4153)
+                     /gene="bla"
+     mat_peptide     complement(3296..4084)
+                     /gene="bla"
+                     /product="beta-lactamase"
+     regulatory      complement(4161..4165)
+                     /regulatory_class="ribosome_binding_site"
+                     /note="Shine-Dalgarno sequence"
+     regulatory      complement(4188..4194)
+                     /regulatory_class="promoter"
+                     /note="promoter P3 (6)"
+     misc_binding    complement(4268..4271)
+                     /bound_moiety="echinomycin"
+     misc_binding    complement(4280..4283)
+                     /bound_moiety="echinomycin"
+     misc_binding    complement(4285..4288)
+                     /bound_moiety="echinomycin"
+     misc_binding    complement(4296..4299)
+                     /bound_moiety="echinomycin"
+     misc_binding    complement(4311..4314)
+                     /bound_moiety="echinomycin"
+     misc_binding    complement(4317..4320)
+                     /bound_moiety="echinomycin"
+     misc_binding    complement(4331..4334)
+                     /bound_moiety="echinomycin"
+ORIGIN      
+        1 ttctcatgtt tgacagctta tcatcgataa gctttaatgc ggtagtttat cacagttaaa
+       61 ttgctaacgc agtcaggcac cgtgtatgaa atctaacaat gcgctcatcg tcatcctcgg
+      121 caccgtcacc ctggatgctg taggcatagg cttggttatg ccggtactgc cgggcctctt
+      181 gcgggatatc gtccattccg acagcatcgc cagtcactat ggcgtgctgc tagcgctata
+      241 tgcgttgatg caatttctat gcgcacccgt tctcggagca ctgtccgacc gctttggccg
+      301 ccgcccagtc ctgctcgctt cgctacttgg agccactatc gactacgcga tcatggcgac
+      361 cacacccgtc ctgtggatcc tctacgccgg acgcatcgtg gccggcatca ccggcgccac
+      421 aggtgcggtt gctggcgcct atatcgccga catcaccgat ggggaagatc gggctcgcca
+      481 cttcgggctc atgagcgctt gtttcggcgt gggtatggtg gcaggccccg tggccggggg
+      541 actgttgggc gccatctcct tgcatgcacc attccttgcg gcggcggtgc tcaacggcct
+      601 caacctacta ctgggctgct tcctaatgca ggagtcgcat aagggagagc gtcgaccgat
+      661 gcccttgaga gccttcaacc cagtcagctc cttccggtgg gcgcggggca tgactatcgt
+      721 cgccgcactt atgactgtct tctttatcat gcaactcgta ggacaggtgc cggcagcgct
+      781 ctgggtcatt ttcggcgagg accgctttcg ctggagcgcg acgatgatcg gcctgtcgct
+      841 tgcggtattc ggaatcttgc acgccctcgc tcaagccttc gtcactggtc ccgccaccaa
+      901 acgtttcggc gagaagcagg ccattatcgc cggcatggcg gccgacgcgc tgggctacgt
+      961 cttgctggcg ttcgcgacgc gaggctggat ggccttcccc attatgattc ttctcgcttc
+     1021 cggcggcatc gggatgcccg cgttgcaggc catgctgtcc aggcaggtag atgacgacca
+     1081 tcagggacag cttcaaggat cgctcgcggc tcttaccagc ctaacttcga tcactggacc
+     1141 gctgatcgtc acggcgattt atgccgcctc ggcgagcaca tggaacgggt tggcatggat
+     1201 tgtaggcgcc gccctatacc ttgtctgcct ccccgcgttg cgtcgcggtg catggagccg
+     1261 ggccacctcg acctgaatgg aagccggcgg cacctcgcta acggattcac cactccaaga
+     1321 attggagcca atcaattctt gcggagaact gtgaatgcgc aaaccaaccc ttggcagaac
+     1381 atatccatcg cgtccgccat ctccagcagc cgcacgcggc gcatctcggg cagcgttggg
+     1441 tcctggccac gggtgcgcat gatcgtgctc ctgtcgttga ggacccggct aggctggcgg
+     1501 ggttgcctta ctggttagca gaatgaatca ccgatacgcg agcgaacgtg aagcgactgc
+     1561 tgctgcaaaa cgtctgcgac ctgagcaaca acatgaatgg tcttcggttt ccgtgtttcg
+     1621 taaagtctgg aaacgcggaa gtcagcgccc tgcaccatta tgttccggat ctgcatcgca
+     1681 ggatgctgct ggctaccctg tggaacacct acatctgtat taacgaagcg ctggcattga
+     1741 ccctgagtga tttttctctg gtcccgccgc atccataccg ccagttgttt accctcacaa
+     1801 cgttccagta accgggcatg ttcatcatca gtaacccgta tcgtgagcat cctctctcgt
+     1861 ttcatcggta tcattacccc catgaacaga aatccccctt acacggaggc atcagtgacc
+     1921 aaacaggaaa aaaccgccct taacatggcc cgctttatca gaagccagac attaacgctt
+     1981 ctggagaaac tcaacgagct ggacgcggat gaacaggcag acatctgtga atcgcttcac
+     2041 gaccacgctg atgagcttta ccgcagctgc ctcgcgcgtt tcggtgatga cggtgaaaac
+     2101 ctctgacaca tgcagctccc ggagacggtc acagcttgtc tgtaagcgga tgccgggagc
+     2161 agacaagccc gtcagggcgc gtcagcgggt gttggcgggt gtcggggcgc agccatgacc
+     2221 cagtcacgta gcgatagcgg agtgtatact ggcttaacta tgcggcatca gagcagattg
+     2281 tactgagagt gcaccatatg cggtgtgaaa taccgcacag atgcgtaagg agaaaatacc
+     2341 gcatcaggcg ctcttccgct tcctcgctca ctgactcgct gcgctcggtc gttcggctgc
+     2401 ggcgagcggt atcagctcac tcaaaggcgg taatacggtt atccacagaa tcaggggata
+     2461 acgcaggaaa gaacatgtga gcaaaaggcc agcaaaaggc caggaaccgt aaaaaggccg
+     2521 cgttgctggc gtttttccat aggctccgcc cccctgacga gcatcacaaa aatcgacgct
+     2581 caagtcagag gtggcgaaac ccgacaggac tataaagata ccaggcgttt ccccctggaa
+     2641 gctccctcgt gcgctctcct gttccgaccc tgccgcttac cggatacctg tccgcctttc
+     2701 tcccttcggg aagcgtggcg ctttctcata gctcacgctg taggtatctc agttcggtgt
+     2761 aggtcgttcg ctccaagctg ggctgtgtgc acgaaccccc cgttcagccc gaccgctgcg
+     2821 ccttatccgg taactatcgt cttgagtcca acccggtaag acacgactta tcgccactgg
+     2881 cagcagccac tggtaacagg attagcagag cgaggtatgt aggcggtgct acagagttct
+     2941 tgaagtggtg gcctaactac ggctacacta gaaggacagt atttggtatc tgcgctctgc
+     3001 tgaagccagt taccttcgga aaaagagttg gtagctcttg atccggcaaa caaaccaccg
+     3061 ctggtagcgg tggttttttt gtttgcaagc agcagattac gcgcagaaaa aaaggatctc
+     3121 aagaagatcc tttgatcttt tctacggggt ctgacgctca gtggaacgaa aactcacgtt
+     3181 aagggatttt ggtcatgaga ttatcaaaaa ggatcttcac ctagatcctt ttaaattaaa
+     3241 aatgaagttt taaatcaatc taaagtatat atgagtaaac ttggtctgac agttaccaat
+     3301 gcttaatcag tgaggcacct atctcagcga tctgtctatt tcgttcatcc atagttgcct
+     3361 gactccccgt cgtgtagata actacgatac gggagggctt accatctggc cccagtgctg
+     3421 caatgatacc gcgagaccca cgctcaccgg ctccagattt atcagcaata aaccagccag
+     3481 ccggaagggc cgagcgcaga agtggtcctg caactttatc cgcctccatc cagtctatta
+     3541 attgttgccg ggaagctaga gtaagtagtt cgccagttaa tagtttgcgc aacgttgttg
+     3601 ccattgctgc aggcatcgtg gtgtcacgct cgtcgtttgg tatggcttca ttcagctccg
+     3661 gttcccaacg atcaaggcga gttacatgat cccccatgtt gtgcaaaaaa gcggttagct
+     3721 ccttcggtcc tccgatcgtt gtcagaagta agttggccgc agtgttatca ctcatggtta
+     3781 tggcagcact gcataattct cttactgtca tgccatccgt aagatgcttt tctgtgactg
+     3841 gtgagtactc aaccaagtca ttctgagaat agtgtatgcg gcgaccgagt tgctcttgcc
+     3901 cggcgtcaac acgggataat accgcgccac atagcagaac tttaaaagtg ctcatcattg
+     3961 gaaaacgttc ttcggggcga aaactctcaa ggatcttacc gctgttgaga tccagttcga
+     4021 tgtaacccac tcgtgcaccc aactgatctt cagcatcttt tactttcacc agcgtttctg
+     4081 ggtgagcaaa aacaggaagg caaaatgccg caaaaaaggg aataagggcg acacggaaat
+     4141 gttgaatact catactcttc ctttttcaat attattgaag catttatcag ggttattgtc
+     4201 tcatgagcgg atacatattt gaatgtattt agaaaaataa acaaataggg gttccgcgca
+     4261 catttccccg aaaagtgcca cctgacgtct aagaaaccat tattatcatg acattaacct
+     4321 ataaaaatag gcgtatcacg aggccctttc gtcttcaaga a
+//
+