Repository 'snpeff'
hg clone https://toolshed.g2.bx.psu.edu/repos/iuc/snpeff

Changeset 10:5b4ac70948d2 (2018-03-27)
Previous changeset 9:68693743661e (2017-11-14) Next changeset 11:bfa6c1b8a03c (2018-03-27)
Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/snpeff commit eea43430ff90fe6b13b295f6d5efb2208401a7ef
modified:
snpEff.xml
snpEff_databases.xml
snpEff_download.xml
snpEff_macros.xml
added:
gbk2fa.py
snpEff_create_db.xml
test-data/input.gbk
test-data/input.gbk.gz
test-data/output_nover.fna
test-data/output_ver.fna
test-data/pBR322.fna
test-data/pBR322.gbk
b
diff -r 68693743661e -r 5b4ac70948d2 gbk2fa.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gbk2fa.py Tue Mar 27 09:44:18 2018 -0400
[
@@ -0,0 +1,43 @@
+import argparse
+import bz2
+import contextlib
+import gzip
+import sys
+
+import magic
+from Bio import SeqIO
+
+# Command-line interface: two positional paths plus an optional flag that
+# controls whether the accession version suffix is kept in FASTA headers.
+parser = argparse.ArgumentParser()
+parser.add_argument("genbank_file", help="GenBank input file. Can be compressed with gzip or bzip2")
+parser.add_argument("fasta_file", help="FASTA output dataset")
+parser.add_argument("--remove_version", dest="remove_version", action="store_true", help="Remove version number from NCBI formatted accession numbers. For example, this would convert 'B000657.2' to 'B000657'")
+args = parser.parse_args()
+
+# Module-level aliases for the input and output paths used by the rest of the script.
+gbk_filename = args.genbank_file
+fa_filename = args.fasta_file
+
+
+@contextlib.contextmanager
+def get_file_handle(gbk_filename):
+    """Yield a text-mode handle for a plain, gzip'ed or bzip2'ed GenBank file.
+
+    The compression is chosen from the MIME type that ``magic.from_file``
+    reports for ``gbk_filename``. The handle is closed when the ``with``
+    block is left, including when the body raises. For any other MIME type
+    the script terminates via ``sys.exit`` with an explanatory message.
+    """
+    # Inspect the path that was passed in, not the global command-line value.
+    f_type = magic.from_file(gbk_filename, mime=True)
+    if f_type == 'text/plain':
+        input_handle = open(gbk_filename, "r")
+    elif f_type == 'application/gzip':
+        input_handle = gzip.open(gbk_filename, "rt")
+    elif f_type == 'application/x-bzip2':
+        input_handle = bz2.open(gbk_filename, "rt")
+    else:
+        sys.exit("Cannot process file of type {}. Only plain, gzip'ed, and bzip2'ed genbank files are accepted ".format(f_type))
+    try:
+        yield input_handle
+    finally:
+        # Runs even if the caller's block raises, so the handle never leaks.
+        input_handle.close()
+
+
+with get_file_handle(gbk_filename) as genbank_in, open(fa_filename, "w") as fasta_out:
+    # Convert the GenBank input record by record into FASTA entries.
+    for record in SeqIO.parse(genbank_in, "genbank"):
+        # With --remove_version only the part of the id before the first '.' is kept.
+        record_name = record.id.split('.')[0] if args.remove_version else record.id
+        print('Writing FASTA record: {}'.format(record_name))
+        fasta_out.write(">{}\n{}\n".format(record_name, record.seq))
b
diff -r 68693743661e -r 5b4ac70948d2 snpEff.xml
--- a/snpEff.xml Tue Nov 14 05:42:51 2017 -0500
+++ b/snpEff.xml Tue Mar 27 09:44:18 2018 -0400
[
b'@@ -1,13 +1,15 @@\n-<tool id="snpEff" name="SnpEff" version="@WRAPPER_VERSION@.1">\n-    <description>Variant effect and annotation</description>\n+<tool id="snpEff" name="SnpEff eff:" version="@wrapper_version@.1">\n+    <description> annotate variants</description>\n     <macros>\n         <import>snpEff_macros.xml</import>\n     </macros>\n-    <expand macro="requirements" />\n+    <requirements>\n+        <expand macro="requirement" />\n+    </requirements>\n     <expand macro="stdio" />\n     <expand macro="version_command" />\n     <command><![CDATA[\n-        snpEff -Xmx8g eff\n+        snpEff @java_options@ eff\n         -i $inputFormat -o ${outputConditional.outputFormat} -upDownStreamLen $udLength\n         #if $spliceSiteSize and str($spliceSiteSize) != \'\':\n           -spliceSiteSize "$spliceSiteSize"\n@@ -72,6 +74,11 @@\n             #end for\n           #end if\n           \'${snpDb.snpeff_db.metadata.genome_version}\'\n+        #elif $snpDb.genomeSrc == \'custom\':\n+            -dataDir \'${snpDb.snpeff_db.extra_files_path}\'\n+            -configOption \'${snpDb.snpeff_db.metadata.genome_version}\'.genome=\'${snpDb.snpeff_db.metadata.genome_version}\'\n+            -configOption \'${snpDb.snpeff_db.metadata.genome_version}\'.codonTable=\'${snpDb.codon_table}\'\n+            \'${snpDb.snpeff_db.metadata.genome_version}\'\n         #else\n           -download\n           \'$snpDb.genome_version\'\n@@ -92,7 +99,7 @@\n         #end if\n     ]]></command>\n     <inputs>\n-        <param name="input" type="data" format="vcf,tabular,pileup,bed" label="Sequence changes (SNPs, MNPs, InDels)"/>\n+        <param name="input" type="data" format="vcf,bed" label="Sequence changes (SNPs, MNPs, InDels)"/>\n \n         <param name="inputFormat" type="select" label="Input format">\n             <option value="vcf" selected="true">VCF</option>\n@@ -116,15 +123,17 @@\n \n         <conditional name="snpDb">\n             <param name="genomeSrc" type="select" 
label="Genome source">\n-                <option value="cached">Locally installed reference genome</option>\n-                <option value="history">Reference genome from your history</option>\n-                <option value="named">Named on demand</option>\n+                <!-- These options are referenced in the help section of SnpEff download tool. If you change them, change help of SnpEff download as well -->\n+                <option value="cached">Locally installed snpEff database</option>\n+                <option value="history">Downloaded snpEff database in your history</option>\n+                <option value="named">Download on demand</option>\n+                <option value="custom">Custom snpEff database in your history</option>\n             </param>\n             <when value="cached">\n                 <param name="genomeVersion" type="select" label="Genome">\n                     <!--GENOME    DESCRIPTION-->\n                     <options from_data_table="snpeffv_genomedb">\n-                            <filter type="static_value" name="snpeff_version" value="@SNPEFF_VERSION@" column="1"/>\n+                            <filter type="static_value" name="snpeff_version" value="@snpeff_version@" column="1"/>\n                             <filter type="unique_value" column="2" />\n                     </options>\n                 </param>\n@@ -138,11 +147,11 @@\n                 </section>\n             </when>\n             <when value="history">\n-                <param name="snpeff_db" type="data" format="snpeffdb" label="@SNPEFF_VERSION@ Genome Data">\n+                <param name="snpeff_db" type="data" format="snpeffdb" label="@snpeff_version@ Genome Data">\n                     <options options_filter_attribute="metadata.snpeff_version" >\n-                        <filter type="add_value" value="@SNPEFF_VERSION@" />\n+                        <filter type="add_value" value="@snpeff_version@" />\n                     </options>\n-                 
   <validator type="expression" message="This version of SnpEff will only work with @SNPEFF_VERSION@ '..b'   <help>\n                By default SnpEff simplifies all chromosome names. For instance \'chr1\' is just \'1\'.\n                You can prepend any string you want to the chromosome name\n             </help>\n             <validator type="regex" message="No whitespace allowed">^\\S*$</validator>\n         </param>\n-        <param name="generate_stats" type="boolean" truevalue="" falsevalue="-noStats" checked="true" label="Produce Summary Stats"/>\n-        <param name="noLog" type="boolean" truevalue="-noLog" falsevalue="" checked="true" label="Suppress reporting usage statistics to server"/>\n+        <param name="generate_stats" type="boolean" truevalue="" falsevalue="-noStats" checked="true" label="Produce Summary Stats" argument="-noStats"/>\n+        <param name="noLog" type="boolean" truevalue="-noLog" falsevalue="" checked="true" label="Suppress reporting usage statistics to server" argument="-noLog"/>\n     </inputs>\n     <outputs>\n         <data name="snpeff_output" format="vcf">\n@@ -330,9 +374,34 @@\n         </test>\n     </tests>\n     <help><![CDATA[\n-This tool calculate the effect of variants (SNPs/MNPs/Insertions) and deletions.\n+\n+**What it does**\n+\n+SnpEff is a variant annotation and effect prediction tool. It annotates and predicts the effects of genetic variants (such as amino acid changes).\n+\n+A typical SnpEff use case would be:\n+\n+ - **Input**: The inputs are predicted variants (SNPs, insertions, deletions and MNPs). The input file is usually obtained as a result of a sequencing experiment, and it is usually in variant call format (VCF).\n+ - **Output**: SnpEff analyzes the input variants. It annotates the variants and calculates the effects they produce on known genes (e.g. amino acid changes). 
A list of effects and annotations that SnpEff can calculate can be found here.\n+\n+By genetic variant we mean difference between a genome and a "reference" genome. As an example, imagine we are sequencing a "sample". Here "sample" can mean anything that you are interested in studying, from a cell culture, to a mouse or a cancer patient.  It is a standard procedure to compare your sample sequences against the corresponding "reference genome". For instance you may compare the cancer patient genome against the "reference genome".\n \n-@EXTERNAL_DOCUMENTATION@\n+In a typical sequencing experiment, you will find many places in the genome where your sample differs from the reference genome. These are called "genomic variants" or just "variants". \n+Typically, variants are categorized as follows:\n+\n+ - SNP (Single-Nucleotide Polymorphism) Reference = \'A\', Sample = \'C\'\n+ - Ins (Insertion) Reference = \'A\', Sample = \'AGT\'\n+ - Del (Deletion) Reference = \'AC\', Sample = \'C\'\n+ - MNP (Multiple-nucleotide polymorphism) Reference = \'ATA\', Sample = \'GTC\'\n+ - MIXED (Multiple-nucleotide and an InDel) Reference = \'ATA\', Sample = \'GTCAGT\'\n+\n+This is not a comprehensive list, it is just to give you an idea.\n+\n+Suppose you have a huge file describing all the differences between your sample and the reference genome. But you want to know more about these variants than just their genetic coordinates. E.g.: Are they in a gene? In an exon? Do they change protein coding? Do they cause premature stop codons? SnpEff can help you answer all these questions. The process of adding this information about the variants is called "Annotation". \n+SnpEff provides several degrees of annotations, from simple (e.g. which gene is each variant affecting) to extremely complex annotations (e.g. will this non-coding variant affect the expression of a gene?). It should be noted that the more complex the annotations, the more it relies in computational predictions. 
Such computational predictions can be incorrect, so results from SnpEff (or any prediction algorithm) cannot be trusted blindly, they must be analyzed and independently validated by corresponding wet-lab experiments.\n+\n+@snpeff_in_galaxy_info@\n+@external_documentation@\n ]]>\n     </help>\n     <expand macro="citations" />\n'
b
diff -r 68693743661e -r 5b4ac70948d2 snpEff_create_db.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/snpEff_create_db.xml Tue Mar 27 09:44:18 2018 -0400
[
@@ -0,0 +1,128 @@
+<tool id="snpEff_build_gb" name="SnpEff build:" version="@wrapper_version@.1">
+    <description> database from Genbank record</description>
+    <macros>
+        <import>snpEff_macros.xml</import>
+    </macros>
+    <requirements>
+        <expand macro="requirement" />
+        <requirement type="package" version="3.6">python</requirement>
+        <requirement type="package" version="1.70">biopython</requirement>
+        <requirement type="package" version="0.4.15">python-magic</requirement>
+        <requirement type="package" version="5.32">libmagic</requirement>
+    </requirements>
+    <expand macro="stdio" />
+    <expand macro="version_command" />
+    <command><![CDATA[
+
+        #if str( $fasta.fasta_selector ) == "yes":
+            python3 '$__tool_directory__/gbk2fa.py' '${input_gbk}' '${output_fasta}'
+            #if $fasta.remove_version:
+                '${fasta.remove_version}'
+            #end if
+            &&
+        #end if
+
+        mkdir -p '${snpeff_output.files_path}'/'${genome_version}' &&
+        
+        ln -s '${input_gbk}' '${snpeff_output.files_path}'/'${genome_version}'/genes.gbk &&
+
+        snpEff @java_options@ build -v 
+        -configOption '${genome_version}'.genome='${genome_version}' 
+        -configOption '${genome_version}'.codonTable='${codon_table}'
+        -genbank -dataDir '$snpeff_output.files_path' '$genome_version'
+
+    ]]></command>
+    <inputs>
+        <param name="genome_version" type="text" value="" label="Name for the database" help="for E. coli K12 you may want to use 'EcK12' etc.">
+            <validator type="regex" message="A genome version name is required">\S+</validator>
+        </param>
+        <param name="input_gbk" type="data" format="genbank,genbank.gz" label="Genbank dataset to build database from" help="This Genbank file will be used to generate snpEff database"/>
+        <param name="codon_table" type="select" label="Select genetic code for this sequence" help="If this sequence uses non-standard genetic code, select one from these options">
+            <option selected="true" value="Standard">Standard</option>
+            <option value="Vertebrate_Mitochondrial">Vertebrate_Mitochondrial</option>
+            <option value="Yeast_Mitochondrial">Yeast_Mitochondrial</option>
+            <option value="Mold_Mitochondrial">Mold_Mitochondrial</option>
+            <option value="Protozoan_Mitochondrial">Protozoan_Mitochondrial</option>
+            <option value="Coelenterate">Coelenterate</option>
+            <option value="Mitochondrial">Mitochondrial</option>
+            <option value="Mycoplasma">Mycoplasma</option>
+            <option value="Spiroplasma">Spiroplasma</option>
+            <option value="Invertebrate_Mitochondrial">Invertebrate_Mitochondrial</option>
+            <option value="Ciliate_Nuclear">Ciliate_Nuclear</option>
+            <option value="Dasycladacean_Nuclear">Dasycladacean_Nuclear</option>
+            <option value="Hexamita_Nuclear">Hexamita_Nuclear</option>
+            <option value="Echinoderm_Mitochondrial">Echinoderm_Mitochondrial</option>
+            <option value="Flatworm_Mitochondrial">Flatworm_Mitochondrial</option>
+            <option value="Euplotid_Nuclear">Euplotid_Nuclear</option>
+            <option value="Bacterial_and_Plant_Plastid">Bacterial_and_Plant_Plastid</option>
+            <option value="Alternative_Yeast_Nuclear">Alternative_Yeast_Nuclear</option>
+            <option value="Ascidian_Mitochondrial">Ascidian_Mitochondrial</option>
+            <option value="Alternative_Flatworm_Mitochondrial">Alternative_Flatworm_Mitochondrial</option>
+            <option value="Blepharisma_Macronuclear">Blepharisma_Macronuclear</option>
+            <option value="Chlorophycean_Mitochondrial">Chlorophycean_Mitochondrial</option>
+            <option value="Trematode_Mitochondrial">Trematode_Mitochondrial</option>
+            <option value="Scenedesmus_obliquus_Mitochondrial">Scenedesmus_obliquus_Mitochondrial</option>
+            <option value="Thraustochytrium_Mitochondrial">Thraustochytrium_Mitochondrial</option>
+        </param>
+        <conditional name="fasta">
+            <param name="fasta_selector" type="select" display="radio" label="Parse Genbank into Fasta" help="This will generate an additional dataset containing all sequences from Genbank file in FASTA format">
+                <option value="yes" selected="true">Yes</option>
+                <option value="no">No</option>
+            </param>
+            <when value="yes">
+                <param type="boolean" name="remove_version" truevalue="--remove_version" falsevalue="" checked="true" label="Remove sequence version label?" help="Genbank sequences have version numbers such as B000564.2. This option removes them leaving only B000564" argument="--remove_version"/>
+            </when>
+            <when value="no"/>
+        </conditional>
+    </inputs>
+    <outputs>
+        <data name="snpeff_output" format="snpeffdb" label="@snpeff_version@ database for ${genome_version}"/>
+        <data name="output_fasta" format="fasta" label="Fasta sequences for ${genome_version}">
+            <filter>fasta['fasta_selector'] == 'yes'</filter>
+        </data>
+    </outputs>
+    <tests>
+        <test>
+            <param name="genome_version" value="pBR322"/>
+            <param name="input_gbk" value="pBR322.gbk" />
+            <output name="snpeff_output">
+                <assert_contents>
+                    <has_text text="pBR322" />
+                </assert_contents>
+            </output>
+            <output name="output_fasta" value="pBR322.fna"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+**What it does**
+
+This tool uses `"snpEff build -genbank"` command to create a snpEff database from a Genbank dataset. If **Parse Genbank into Fasta** is selected (the default behavior) the tool will also convert Genbank dataset into a single FASTA dataset.
+
+
+Using Genbank data for creating databases has several advantages:
+
+ #. Genbank files contain annotations (such as locations of genes) together with sequences. This way one ensures that these two are in sync with each other.
+ #. When you are analyzing small genomes it is much more convenient to create a database on the fly and use it.
+
+-------
+
+.. class:: infomark
+
+**The usage scenario**
+
+Suppose you have a series of Illumina reads from an experiment involving *E. coli* K-12 MG1655. You want to map these reads to the reference genome of K-12 MG1655, call variants, and annotate them using snpEff. This tool enables you to follow the following analysis steps:
+
+ #. Download genome from `NCBI <https://www.ncbi.nlm.nih.gov>`_ into Galaxy.
+ #. Use this tool to generate a snpEff database and FASTA sequences from the file you downloaded at step 1.
+ #. Use your Illumina reads to map against FASTA dataset generated in the previous step using BWA-MEM.
+ #. Call variants using **Freebayes**.
+ #. Annotate vcf output of Freebayes with **SnpEff eff** using database generated at step 2 (using *Custom* option for **Genome source** parameter).
+
+In this scenario Genbank dataset is used twice. First, it is used to produce FASTA sequences that are used by BWA to map against. Second, it is used to create snpEff database. This guarantees that you will not have any issues related to reference sequence naming.
+
+@snpeff_in_galaxy_info@
+@external_documentation@
+]]>
+    </help>
+    <expand macro="citations" />
+</tool>
b
diff -r 68693743661e -r 5b4ac70948d2 snpEff_databases.xml
--- a/snpEff_databases.xml Tue Nov 14 05:42:51 2017 -0500
+++ b/snpEff_databases.xml Tue Mar 27 09:44:18 2018 -0400
[
@@ -1,18 +1,47 @@
-<tool id="snpEff_databases" name="SnpEff available databases" version="@WRAPPER_VERSION@.1">
-    <description></description>
+<tool id="snpEff_databases" name="SnpEff databases:" version="@wrapper_version@.1">
+    <description> list available databases</description>
     <macros>
         <import>snpEff_macros.xml</import>
     </macros>
-    <expand macro="requirements" />
+    <requirements>
+        <expand macro="requirement" />
+    </requirements>
     <expand macro="stdio" />
     <expand macro="version_command" />
     <command><![CDATA[
-snpEff databases | grep -v '^---' | sed 's/^Genome/#Genome/' | sed 's/  *//g' > '$snpeff_dbs'
+
+        snpEff databases | grep -v '^---' | sed 's/^Genome/#Genome/' | sed 's/  *//g' 
+
+        #if $include_pattern:
+             | grep '${include_pattern}' 
+        #end if
+
+        #if $exclude_pattern:
+             | grep -v '${exclude_pattern}' 
+        #end if
+
+        > '${snpeff_dbs}'
+            
     ]]></command>
     <inputs>
+        <param name="include_pattern" size="40" type="text" value="" optional="True" label="List entries matching the following expression" help="Databases matching this expression will be listed. Here you can enter text or regular expression. For example, to show only mouse databases use 'Mouse'. Note that this parameter is case-sensitive.">
+            <sanitizer>
+                <valid initial="string.digits,string.letters">
+                    <add value="^" />
+                </valid>
+            </sanitizer>
+        </param>
+        <param name="exclude_pattern" size="40" type="text" value="" optional="True" label="Do not output entries matching the following expression" help="Databases matching this expression WILL NOT BE listed. Here you can enter text or regular expression. For example, to avoid all ENSEMBL bundles enter 'ENSEMBL'. Note that this parameter is case-sensitive.">
+            <sanitizer>
+                <valid initial="string.digits,string.letters">
+                    <add value="^" />
+                </valid>
+            </sanitizer>
+        </param>
+
     </inputs>
     <outputs>
-        <data format="tabular" name="snpeff_dbs" label="${tool.name} @SNPEFF_VERSION@ available databases" />
+        <data format="tabular" name="snpeff_dbs" label="${tool.name} @snpeff_version@ available databases" />
     </outputs>
     <tests>
         <test>
@@ -22,9 +51,39 @@
                 </assert_contents>
             </output>
         </test>
+         <test>
+            <param name="include_pattern" value="ebola"/>
+            <output name="snpeff_dbs">
+                <assert_contents>
+                    <has_text text="ebola_zaire" />
+                </assert_contents>
+            </output>
+        </test>
     </tests>
     <help><![CDATA[
-@EXTERNAL_DOCUMENTATION@
+
+**What it does**
+
+This tool downloads the master list of snpEff databases from @snpeff_database_url@. You can then look at this list and decide which database to use for your analysis. For example, if **List entries matching the following expression** parameter of this tool is set to *Mouse* then it will produce a tabular dataset with the following content::
+
+    mm10  Mouse  http://downloads.sourceforge.net/project/snpeff/databases/v4_3/snpEff_v4_3_mm10.zip
+    mm9   Mouse  http://downloads.sourceforge.net/project/snpeff/databases/v4_3/snpEff_v4_3_mm9.zip
+
+This means that there are two available snpEff databases for mouse genome versions mm9 and mm10. In order to download these databases you should use identifier from the first column (e.g., mm9 or mm10 in this case). 
+
+-------
+
+.. class:: infomark
+
+**The usage scenario**
+
+There are two ways to use names of databases obtained with this tool in Galaxy's version of snpEff:
+
+    #. Use **SnpEff download** tool. It will download the database to the history and you will be able to use it in **SnpEff eff** tool using *Downloaded snpEff database in your history* option of the **Genome source** parameter.
+    #. Use *Download on demand* option of the **SnpEff eff** tool (again, **Genome source** parameter). In this case snpEff will download the database before performing annotation. 
+
+@snpeff_in_galaxy_info@
+@external_documentation@
     ]]></help>
     <expand macro="citations" />
 </tool>
b
diff -r 68693743661e -r 5b4ac70948d2 snpEff_download.xml
--- a/snpEff_download.xml Tue Nov 14 05:42:51 2017 -0500
+++ b/snpEff_download.xml Tue Mar 27 09:44:18 2018 -0400
[
@@ -1,22 +1,24 @@
-<tool id="snpEff_download" name="SnpEff Download" version="@WRAPPER_VERSION@.1">
-    <description>Download a new database</description>
+<tool id="snpEff_download" name="SnpEff download:" version="@wrapper_version@.1">
+    <description> download a pre-built database</description>
     <macros>
         <import>snpEff_macros.xml</import>
     </macros>
-    <expand macro="requirements" />
+    <requirements>
+        <expand macro="requirement" />
+    </requirements>
     <expand macro="stdio" />
     <expand macro="version_command" />
     <command><![CDATA[
 snpEff download -dataDir '$snpeff_db.files_path' -v '$genome_version'
     ]]></command>
     <inputs>
-        <param name="genome_version" type="text" value="" label="Select the genome version you want to download (e.g. GRCh38.86, GRCh37.75, hg38, or GRCm38.86)">
-            <help>@SNPEFF_DATABASE_URL@</help>
+        <param name="genome_version" type="text" value="" label="Select the annotation database you want to download (e.g. GRCh38.86, mm10 etc.)" help="The list of available databases can be obtained with 'SnpEff databases' tool">
+            <help>@snpeff_database_url@</help>
             <validator type="regex" message="A genome version name is required">\S+</validator>
         </param>
     </inputs>
     <outputs>
-        <data format="snpeffdb" name="snpeff_db" label="${tool.name} @SNPEFF_VERSION@ ${genome_version}" />
+        <data format="snpeffdb" name="snpeff_db" label="${tool.name} @snpeff_version@ ${genome_version}"/>
     </outputs>
     <tests>
         <test>
@@ -29,7 +31,24 @@
         </test>
     </tests>
     <help><![CDATA[
-@EXTERNAL_DOCUMENTATION@
+
+**What it does**
+
+This tool downloads a specified database from @snpeff_database_url@. It deposits it into the history. 
+
+-------
+
+.. class:: infomark
+
+**The usage scenario**
+
+Suppose you want to annotate a VCF file containing variants within mm10 version of the Mouse genome. To do this you can:
+
+    #. Download mm10 snpEff database by typing *mm10* into **Select the annotation database...** text box.
+    #. Use **SnpEff eff** by choosing the downloaded database from the history using *Downloaded snpEff database in your history* option of the **Genome source** parameter.
+    
+@snpeff_in_galaxy_info@
+@external_documentation@
     ]]></help>
     <expand macro="citations" />
 </tool>
b
diff -r 68693743661e -r 5b4ac70948d2 snpEff_macros.xml
--- a/snpEff_macros.xml Tue Nov 14 05:42:51 2017 -0500
+++ b/snpEff_macros.xml Tue Mar 27 09:44:18 2018 -0400
b
@@ -1,8 +1,6 @@
 <macros>
-    <xml name="requirements">
-        <requirements>
-            <requirement type="package" version="4.3.1r">snpeff</requirement>
-        </requirements>
+    <xml name="requirement">
+        <requirement type="package" version="4.3.1t">snpeff</requirement>
     </xml>
   <xml name="stdio">
     <stdio>
@@ -15,16 +13,61 @@
 snpEff -version
     ]]></version_command>
   </xml>
-  <token name="@WRAPPER_VERSION@">4.3r</token>
-  <token name="@SNPEFF_VERSION@">SnpEff4.3</token>
-  <token name="@SNPEFF_DATABASE_URL@">https://sourceforge.net/projects/snpeff/files/databases/v4_3/</token>
-  <token name="@EXTERNAL_DOCUMENTATION@">
-For details about this tool, please go to: http://snpeff.sourceforge.net/SnpEff_manual.html
+  <token name="@wrapper_version@">4.3.1t</token>
+  <token name="@snpeff_version@">SnpEff4.3</token>
+  <token name="@snpeff_database_url@">https://sourceforge.net/projects/snpeff/files/databases/v4_3/</token>
+  <token name="@java_options@">-Xmx\${GALAXY_MEMORY_MB:-8192}m</token>
+  <token name="@external_documentation@">
+
+
+-------
+
+To learn more about snpEff read its manual at http://snpeff.sourceforge.net/SnpEff_manual.html
   </token>
+  <token name="@snpeff_in_galaxy_info@">
+
+-------
+
+.. class:: warningmark
+
+**Using SnpEff in Galaxy: A few points to remember**
+
+SnpEff relies on specially formatted databases to generate annotations. It will not work without them. There are several ways in which these databases can be obtained.
+
+**Pre-cached databases**
+
+Many standard (e.g., human, mouse, *Drosophila*) databases are likely pre-cached within a given Galaxy instance. You should be able to see them listed in **Genome** drop-down of **SnpEff eff** tool. 
+
+If you *do not see them* keep reading...
+
+**Download pre-built databases**
+
+SnpEff project generates large numbers of pre-built databases. These are available at @snpeff_database_url@ and can be downloaded. Follow these steps:
+
+  #. Use **SnpEff databases** tool to generate a list of existing databases. Note the name of the database you need.
+  #. Use **SnpEff download** tool to download the database.
+  #. Finally, use **SnpEff eff** by choosing the downloaded database from the history using *Downloaded snpEff database in your history* option of the **Genome source** parameter.
+
+Alternatively, you can specify the name of the database directly in **SnpEff eff** using the *Download on demand* option (again, **Genome source** parameter). In this case snpEff will download the database before performing annotation. 
+
+**Create your own database**
+
+In cases when you are dealing with bacterial or viral (or, frankly, any other) genomes it may be easier to create database yourself. For this you need:
+
+ #. Download Genbank record corresponding to your genome of interest from NCBI. 
+ #. Use **SnpEff build** to create the database. 
+ #. Use the database in **SnpEff eff** (using *Custom* option for **Genome source** parameter).
+
+Creating custom database has one benefit. The **SnpEff build** tool normally produces two outputs: (1) a SnpEff database and (2) FASTA file containing sequences from the Genbank file. If you are performing your experiment from the beginning by mapping reads against a genome and finding variants before annotating them with SnpEff you can use **this FASTA file** as a reference to map your reads against. This will guarantee that you will not have any issues related to reference sequence naming -- the most common source of SnpEff errors. 
+
+</token>
+
+
+
   <xml name="citations">
       <citations>
         <citation type="doi">10.4161/fly.19695</citation>
         <yield />
       </citations>
   </xml>
-</macros>
+</macros>
\ No newline at end of file
b
diff -r 68693743661e -r 5b4ac70948d2 test-data/input.gbk
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input.gbk Tue Mar 27 09:44:18 2018 -0400
b
@@ -0,0 +1,79 @@
+LOCUS       LC129268                2808 bp    DNA     linear   SYN 19-MAR-2016
+DEFINITION  Synthetic construct DNA, plasmid vector pUC18 including artificial
+            sequence.
+ACCESSION   LC129268
+VERSION     LC129268.1
+KEYWORDS    .
+SOURCE      synthetic construct
+  ORGANISM  synthetic construct
+            other sequences; artificial sequences.
+REFERENCE   1
+  AUTHORS   Takahashi,M., Kita,Y., Mizuno,A. and Goto-Yamamoto,N.
+  TITLE     Evaluation of method bias in bacterial community analysis
+  JOURNAL   Unpublished
+REFERENCE   2  (bases 1 to 2808)
+  AUTHORS   Takahashi,M. and Goto-Yamamoto,N.
+  TITLE     Direct Submission
+  JOURNAL   Submitted (03-MAR-2016) Contact:Masayuki Takahashi National
+            Research Institute of Brewing, Technology Development Research
+            Division; 3-7-1 Kagamiyama, Higashi-hiroshima, Hiroshima 739-0046,
+            Japan URL :http://www.nrib.go.jp/index.html
+FEATURES             Location/Qualifiers
+     source          1..2808
+                     /organism="synthetic construct"
+                     /mol_type="other DNA"
+                     /db_xref="taxon:32630"
+                     /note="plasmid vector pUC18 including artificial sequence"
+     misc_feature    439..560
+                     /note="internal standard DNA for quantification of
+                     microbial rDNA using quantitative PCR"
+ORIGIN      
+        1 tcgcgcgttt cggtgatgac ggtgaaaacc tctgacacat gcagctcccg gagacggtca
+       61 cagcttgtct gtaagcggat gccgggagca gacaagcccg tcagggcgcg tcagcgggtg
+      121 ttggcgggtg tcggggctgg cttaactatg cggcatcaga gcagattgta ctgagagtgc
+      181 accatatgcg gtgtgaaata ccgcacagat gcgtaaggag aaaataccgc atcaggcgcc
+      241 attcgccatt caggctgcgc aactgttggg aagggcgatc ggtgcgggcc tcttcgctat
+      301 tacgccagct ggcgaaaggg ggatgtgctg caaggcgatt aagttgggta acgccagggt
+      361 tttcccagtc acgacgttgt aaaacgacgg ccagtgccaa gcttgcatgc ctgcaggtcg
+      421 actctagagg atccccggaa ctaatacgac tcactatagg gtccgatctt ccgaggtctc
+      481 atatcgatcg gtagggcatc taatggcttc ggagttcaag ggctatattc gccatgtcag
+      541 atttgtatgc caaaggccgg gtaccgagct cgaattcgta atcatggtca tagctgtttc
+      601 ctgtgtgaaa ttgttatccg ctcacaattc cacacaacat acgagccgga agcataaagt
+      661 gtaaagcctg gggtgcctaa tgagtgagct aactcacatt aattgcgttg cgctcactgc
+      721 ccgctttcca gtcgggaaac ctgtcgtgcc agctgcatta atgaatcggc caacgcgcgg
+      781 ggagaggcgg tttgcgtatt gggcgctctt ccgcttcctc gctcactgac tcgctgcgct
+      841 cggtcgttcg gctgcggcga gcggtatcag ctcactcaaa ggcggtaata cggttatcca
+      901 cagaatcagg ggataacgca ggaaagaaca tgtgagcaaa aggccagcaa aaggccagga
+      961 accgtaaaaa ggccgcgttg ctggcgtttt tccataggct ccgcccccct gacgagcatc
+     1021 acaaaaatcg acgctcaagt cagaggtggc gaaacccgac aggactataa agataccagg
+     1081 cgtttccccc tggaagctcc ctcgtgcgct ctcctgttcc gaccctgccg cttaccggat
+     1141 acctgtccgc ctttctccct tcgggaagcg tggcgctttc tcatagctca cgctgtaggt
+     1201 atctcagttc ggtgtaggtc gttcgctcca agctgggctg tgtgcacgaa ccccccgttc
+     1261 agcccgaccg ctgcgcctta tccggtaact atcgtcttga gtccaacccg gtaagacacg
+     1321 acttatcgcc actggcagca gccactggta acaggattag cagagcgagg tatgtaggcg
+     1381 gtgctacaga gttcttgaag tggtggccta actacggcta cactagaagg acagtatttg
+     1441 gtatctgcgc tctgctgaag ccagttacct tcggaaaaag agttggtagc tcttgatccg
+     1501 gcaaacaaac caccgctggt agcggtggtt tttttgtttg caagcagcag attacgcgca
+     1561 gaaaaaaagg atctcaagaa gatcctttga tcttttctac ggggtctgac gctcagtgga
+     1621 acgaaaactc acgttaaggg attttggtca tgagattatc aaaaaggatc ttcacctaga
+     1681 tccttttaaa ttaaaaatga agttttaaat caatctaaag tatatatgag taaacttggt
+     1741 ctgacagtta ccaatgctta atcagtgagg cacctatctc agcgatctgt ctatttcgtt
+     1801 catccatagt tgcctgactc cccgtcgtgt agataactac gatacgggag ggcttaccat
+     1861 ctggccccag tgctgcaatg ataccgcgag acccacgctc accggctcca gatttatcag
+     1921 caataaacca gccagccgga agggccgagc gcagaagtgg tcctgcaact ttatccgcct
+     1981 ccatccagtc tattaattgt tgccgggaag ctagagtaag tagttcgcca gttaatagtt
+     2041 tgcgcaacgt tgttgccatt gctacaggca tcgtggtgtc acgctcgtcg tttggtatgg
+     2101 cttcattcag ctccggttcc caacgatcaa ggcgagttac atgatccccc atgttgtgca
+     2161 aaaaagcggt tagctccttc ggtcctccga tcgttgtcag aagtaagttg gccgcagtgt
+     2221 tatcactcat ggttatggca gcactgcata attctcttac tgtcatgcca tccgtaagat
+     2281 gcttttctgt gactggtgag tactcaacca agtcattctg agaatagtgt atgcggcgac
+     2341 cgagttgctc ttgcccggcg tcaatacggg ataataccgc gccacatagc agaactttaa
+     2401 aagtgctcat cattggaaaa cgttcttcgg ggcgaaaact ctcaaggatc ttaccgctgt
+     2461 tgagatccag ttcgatgtaa cccactcgtg cacccaactg atcttcagca tcttttactt
+     2521 tcaccagcgt ttctgggtga gcaaaaacag gaaggcaaaa tgccgcaaaa aagggaataa
+     2581 gggcgacacg gaaatgttga atactcatac tcttcctttt tcaatattat tgaagcattt
+     2641 atcagggtta ttgtctcatg agcggataca tatttgaatg tatttagaaa aataaacaaa
+     2701 taggggttcc gcgcacattt ccccgaaaag tgccacctga cgtctaagaa accattatta
+     2761 tcatgacatt aacctataaa aataggcgta tcacgaggcc ctttcgtc
+//
+
b
diff -r 68693743661e -r 5b4ac70948d2 test-data/input.gbk.gz
b
Binary file test-data/input.gbk.gz has changed
b
diff -r 68693743661e -r 5b4ac70948d2 test-data/output_nover.fna
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_nover.fna Tue Mar 27 09:44:18 2018 -0400
b
@@ -0,0 +1,2 @@
+>LC129268
+TCGCGCGTTTCGGTGATGACGGTGAAAACCTCTGACACATGCAGCTCCCGGAGACGGTCACAGCTTGTCTGTAAGCGGATGCCGGGAGCAGACAAGCCCGTCAGGGCGCGTCAGCGGGTGTTGGCGGGTGTCGGGGCTGGCTTAACTATGCGGCATCAGAGCAGATTGTACTGAGAGTGCACCATATGCGGTGTGAAATACCGCACAGATGCGTAAGGAGAAAATACCGCATCAGGCGCCATTCGCCATTCAGGCTGCGCAACTGTTGGGAAGGGCGATCGGTGCGGGCCTCTTCGCTATTACGCCAGCTGGCGAAAGGGGGATGTGCTGCAAGGCGATTAAGTTGGGTAACGCCAGGGTTTTCCCAGTCACGACGTTGTAAAACGACGGCCAGTGCCAAGCTTGCATGCCTGCAGGTCGACTCTAGAGGATCCCCGGAACTAATACGACTCACTATAGGGTCCGATCTTCCGAGGTCTCATATCGATCGGTAGGGCATCTAATGGCTTCGGAGTTCAAGGGCTATATTCGCCATGTCAGATTTGTATGCCAAAGGCCGGGTACCGAGCTCGAATTCGTAATCATGGTCATAGCTGTTTCCTGTGTGAAATTGTTATCCGCTCACAATTCCACACAACATACGAGCCGGAAGCATAAAGTGTAAAGCCTGGGGTGCCTAATGAGTGAGCTAACTCACATTAATTGCGTTGCGCTCACTGCCCGCTTTCCAGTCGGGAAACCTGTCGTGCCAGCTGCATTAATGAATCGGCCAACGCGCGGGGAGAGGCGGTTTGCGTATTGGGCGCTCTTCCGCTTCCTCGCTCACTGACTCGCTGCGCTCGGTCGTTCGGCTGCGGCGAGCGGTATCAGCTCACTCAAAGGCGGTAATACGGTTATCCACAGAATCAGGGGATAACGCAGGAAAGAACATGTGAGCAAAAGGCCAGCAAAAGGCCAGGAACCGTAAAAAGGCCGCGTTGCTGGCGTTTTTCCATAGGCTCCGCCCCCCTGACGAGCATCACAAAAATCGACGCTCAAGTCAGAGGTGGCGAAACCCGACAGGACTATAAAGATACCAGGCGTTTCCCCCTGGAAGCTCCCTCGTGCGCTCTCCTGTTCCGACCCTGCCGCTTACCGGATACCTGTCCGCCTTTCTCCCTTCGGGAAGCGTGGCGCTTTCTCATAGCTCACGCTGTAGGTATCTCAGTTCGGTGTAGGTCGTTCGCTCCAAGCTGGGCTGTGTGCACGAACCCCCCGTTCAGCCCGACCGCTGCGCCTTATCCGGTAACTATCGTCTTGAGTCCAACCCGGTAAGACACGACTTATCGCCACTGGCAGCAGCCACTGGTAACAGGATTAGCAGAGCGAGGTATGTAGGCGGTGCTACAGAGTTCTTGAAGTGGTGGCCTAACTACGGCTACACTAGAAGGACAGTATTTGGTATCTGCGCTCTGCTGAAGCCAGTTACCTTCGGAAAAAGAGTTGGTAGCTCTTGATCCGGCAAACAAACCACCGCTGGTAGCGGTGGTTTTTTTGTTTGCAAGCAGCAGATTACGCGCAGAAAAAAAGGATCTCAAGAAGATCCTTTGATCTTTTCTACGGGGTCTGACGCTCAGTGGAACGAAAACTCACGTTAAGGGATTTTGGTCATGAGATTATCAAAAAGGATCTTCACCTAGATCCTTTTAAATTAAAAATGAAGTTTTAAATCAATCTAAAGTATATATGAGTAAACTTGGTCTGACAGTTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTATTTCGTTCATCCATAGTTGCCTGACTCCCCGTCGTGTAGATAACTACGATACGGGAGGGCTTACCATCTGGCCCCAGTGCTGCAATGATACCGCGAGACCCACGCTCACCGGCTCCAGATTTATCAGCAATAAACCAGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCCATCCAGTCTATTAATTG
TTGCCGGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTGTTGCCATTGCTACAGGCATCGTGGTGTCACGCTCGTCGTTTGGTATGGCTTCATTCAGCTCCGGTTCCCAACGATCAAGGCGAGTTACATGATCCCCCATGTTGTGCAAAAAAGCGGTTAGCTCCTTCGGTCCTCCGATCGTTGTCAGAAGTAAGTTGGCCGCAGTGTTATCACTCATGGTTATGGCAGCACTGCATAATTCTCTTACTGTCATGCCATCCGTAAGATGCTTTTCTGTGACTGGTGAGTACTCAACCAAGTCATTCTGAGAATAGTGTATGCGGCGACCGAGTTGCTCTTGCCCGGCGTCAATACGGGATAATACCGCGCCACATAGCAGAACTTTAAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACATTTCCCCGAAAAGTGCCACCTGACGTCTAAGAAACCATTATTATCATGACATTAACCTATAAAAATAGGCGTATCACGAGGCCCTTTCGTC
b
diff -r 68693743661e -r 5b4ac70948d2 test-data/output_ver.fna
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_ver.fna Tue Mar 27 09:44:18 2018 -0400
b
@@ -0,0 +1,2 @@
+>LC129268.1
+TCGCGCGTTTCGGTGATGACGGTGAAAACCTCTGACACATGCAGCTCCCGGAGACGGTCACAGCTTGTCTGTAAGCGGATGCCGGGAGCAGACAAGCCCGTCAGGGCGCGTCAGCGGGTGTTGGCGGGTGTCGGGGCTGGCTTAACTATGCGGCATCAGAGCAGATTGTACTGAGAGTGCACCATATGCGGTGTGAAATACCGCACAGATGCGTAAGGAGAAAATACCGCATCAGGCGCCATTCGCCATTCAGGCTGCGCAACTGTTGGGAAGGGCGATCGGTGCGGGCCTCTTCGCTATTACGCCAGCTGGCGAAAGGGGGATGTGCTGCAAGGCGATTAAGTTGGGTAACGCCAGGGTTTTCCCAGTCACGACGTTGTAAAACGACGGCCAGTGCCAAGCTTGCATGCCTGCAGGTCGACTCTAGAGGATCCCCGGAACTAATACGACTCACTATAGGGTCCGATCTTCCGAGGTCTCATATCGATCGGTAGGGCATCTAATGGCTTCGGAGTTCAAGGGCTATATTCGCCATGTCAGATTTGTATGCCAAAGGCCGGGTACCGAGCTCGAATTCGTAATCATGGTCATAGCTGTTTCCTGTGTGAAATTGTTATCCGCTCACAATTCCACACAACATACGAGCCGGAAGCATAAAGTGTAAAGCCTGGGGTGCCTAATGAGTGAGCTAACTCACATTAATTGCGTTGCGCTCACTGCCCGCTTTCCAGTCGGGAAACCTGTCGTGCCAGCTGCATTAATGAATCGGCCAACGCGCGGGGAGAGGCGGTTTGCGTATTGGGCGCTCTTCCGCTTCCTCGCTCACTGACTCGCTGCGCTCGGTCGTTCGGCTGCGGCGAGCGGTATCAGCTCACTCAAAGGCGGTAATACGGTTATCCACAGAATCAGGGGATAACGCAGGAAAGAACATGTGAGCAAAAGGCCAGCAAAAGGCCAGGAACCGTAAAAAGGCCGCGTTGCTGGCGTTTTTCCATAGGCTCCGCCCCCCTGACGAGCATCACAAAAATCGACGCTCAAGTCAGAGGTGGCGAAACCCGACAGGACTATAAAGATACCAGGCGTTTCCCCCTGGAAGCTCCCTCGTGCGCTCTCCTGTTCCGACCCTGCCGCTTACCGGATACCTGTCCGCCTTTCTCCCTTCGGGAAGCGTGGCGCTTTCTCATAGCTCACGCTGTAGGTATCTCAGTTCGGTGTAGGTCGTTCGCTCCAAGCTGGGCTGTGTGCACGAACCCCCCGTTCAGCCCGACCGCTGCGCCTTATCCGGTAACTATCGTCTTGAGTCCAACCCGGTAAGACACGACTTATCGCCACTGGCAGCAGCCACTGGTAACAGGATTAGCAGAGCGAGGTATGTAGGCGGTGCTACAGAGTTCTTGAAGTGGTGGCCTAACTACGGCTACACTAGAAGGACAGTATTTGGTATCTGCGCTCTGCTGAAGCCAGTTACCTTCGGAAAAAGAGTTGGTAGCTCTTGATCCGGCAAACAAACCACCGCTGGTAGCGGTGGTTTTTTTGTTTGCAAGCAGCAGATTACGCGCAGAAAAAAAGGATCTCAAGAAGATCCTTTGATCTTTTCTACGGGGTCTGACGCTCAGTGGAACGAAAACTCACGTTAAGGGATTTTGGTCATGAGATTATCAAAAAGGATCTTCACCTAGATCCTTTTAAATTAAAAATGAAGTTTTAAATCAATCTAAAGTATATATGAGTAAACTTGGTCTGACAGTTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTATTTCGTTCATCCATAGTTGCCTGACTCCCCGTCGTGTAGATAACTACGATACGGGAGGGCTTACCATCTGGCCCCAGTGCTGCAATGATACCGCGAGACCCACGCTCACCGGCTCCAGATTTATCAGCAATAAACCAGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCCATCCAGTCTATTAATTG
TTGCCGGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTGTTGCCATTGCTACAGGCATCGTGGTGTCACGCTCGTCGTTTGGTATGGCTTCATTCAGCTCCGGTTCCCAACGATCAAGGCGAGTTACATGATCCCCCATGTTGTGCAAAAAAGCGGTTAGCTCCTTCGGTCCTCCGATCGTTGTCAGAAGTAAGTTGGCCGCAGTGTTATCACTCATGGTTATGGCAGCACTGCATAATTCTCTTACTGTCATGCCATCCGTAAGATGCTTTTCTGTGACTGGTGAGTACTCAACCAAGTCATTCTGAGAATAGTGTATGCGGCGACCGAGTTGCTCTTGCCCGGCGTCAATACGGGATAATACCGCGCCACATAGCAGAACTTTAAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACATTTCCCCGAAAAGTGCCACCTGACGTCTAAGAAACCATTATTATCATGACATTAACCTATAAAAATAGGCGTATCACGAGGCCCTTTCGTC
b
diff -r 68693743661e -r 5b4ac70948d2 test-data/pBR322.fna
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/pBR322.fna Tue Mar 27 09:44:18 2018 -0400
b
@@ -0,0 +1,2 @@
+>J01749
+TTCTCATGTTTGACAGCTTATCATCGATAAGCTTTAATGCGGTAGTTTATCACAGTTAAATTGCTAACGCAGTCAGGCACCGTGTATGAAATCTAACAATGCGCTCATCGTCATCCTCGGCACCGTCACCCTGGATGCTGTAGGCATAGGCTTGGTTATGCCGGTACTGCCGGGCCTCTTGCGGGATATCGTCCATTCCGACAGCATCGCCAGTCACTATGGCGTGCTGCTAGCGCTATATGCGTTGATGCAATTTCTATGCGCACCCGTTCTCGGAGCACTGTCCGACCGCTTTGGCCGCCGCCCAGTCCTGCTCGCTTCGCTACTTGGAGCCACTATCGACTACGCGATCATGGCGACCACACCCGTCCTGTGGATCCTCTACGCCGGACGCATCGTGGCCGGCATCACCGGCGCCACAGGTGCGGTTGCTGGCGCCTATATCGCCGACATCACCGATGGGGAAGATCGGGCTCGCCACTTCGGGCTCATGAGCGCTTGTTTCGGCGTGGGTATGGTGGCAGGCCCCGTGGCCGGGGGACTGTTGGGCGCCATCTCCTTGCATGCACCATTCCTTGCGGCGGCGGTGCTCAACGGCCTCAACCTACTACTGGGCTGCTTCCTAATGCAGGAGTCGCATAAGGGAGAGCGTCGACCGATGCCCTTGAGAGCCTTCAACCCAGTCAGCTCCTTCCGGTGGGCGCGGGGCATGACTATCGTCGCCGCACTTATGACTGTCTTCTTTATCATGCAACTCGTAGGACAGGTGCCGGCAGCGCTCTGGGTCATTTTCGGCGAGGACCGCTTTCGCTGGAGCGCGACGATGATCGGCCTGTCGCTTGCGGTATTCGGAATCTTGCACGCCCTCGCTCAAGCCTTCGTCACTGGTCCCGCCACCAAACGTTTCGGCGAGAAGCAGGCCATTATCGCCGGCATGGCGGCCGACGCGCTGGGCTACGTCTTGCTGGCGTTCGCGACGCGAGGCTGGATGGCCTTCCCCATTATGATTCTTCTCGCTTCCGGCGGCATCGGGATGCCCGCGTTGCAGGCCATGCTGTCCAGGCAGGTAGATGACGACCATCAGGGACAGCTTCAAGGATCGCTCGCGGCTCTTACCAGCCTAACTTCGATCACTGGACCGCTGATCGTCACGGCGATTTATGCCGCCTCGGCGAGCACATGGAACGGGTTGGCATGGATTGTAGGCGCCGCCCTATACCTTGTCTGCCTCCCCGCGTTGCGTCGCGGTGCATGGAGCCGGGCCACCTCGACCTGAATGGAAGCCGGCGGCACCTCGCTAACGGATTCACCACTCCAAGAATTGGAGCCAATCAATTCTTGCGGAGAACTGTGAATGCGCAAACCAACCCTTGGCAGAACATATCCATCGCGTCCGCCATCTCCAGCAGCCGCACGCGGCGCATCTCGGGCAGCGTTGGGTCCTGGCCACGGGTGCGCATGATCGTGCTCCTGTCGTTGAGGACCCGGCTAGGCTGGCGGGGTTGCCTTACTGGTTAGCAGAATGAATCACCGATACGCGAGCGAACGTGAAGCGACTGCTGCTGCAAAACGTCTGCGACCTGAGCAACAACATGAATGGTCTTCGGTTTCCGTGTTTCGTAAAGTCTGGAAACGCGGAAGTCAGCGCCCTGCACCATTATGTTCCGGATCTGCATCGCAGGATGCTGCTGGCTACCCTGTGGAACACCTACATCTGTATTAACGAAGCGCTGGCATTGACCCTGAGTGATTTTTCTCTGGTCCCGCCGCATCCATACCGCCAGTTGTTTACCCTCACAACGTTCCAGTAACCGGGCATGTTCATCATCAGTAACCCGTATCGTGAGCATCCTCTCTCGTTTCATCGGTATCATTACCCCCATGAACAGAAATCCCCCTTACACGGAGGCATCAGTGACCAAACAGGAAAAAACCGCCCTTAACATGGCCCGCTTTATCAGAAGCCAGACATTAACGCTTCTGGAGAAACTCAACGAGC
TGGACGCGGATGAACAGGCAGACATCTGTGAATCGCTTCACGACCACGCTGATGAGCTTTACCGCAGCTGCCTCGCGCGTTTCGGTGATGACGGTGAAAACCTCTGACACATGCAGCTCCCGGAGACGGTCACAGCTTGTCTGTAAGCGGATGCCGGGAGCAGACAAGCCCGTCAGGGCGCGTCAGCGGGTGTTGGCGGGTGTCGGGGCGCAGCCATGACCCAGTCACGTAGCGATAGCGGAGTGTATACTGGCTTAACTATGCGGCATCAGAGCAGATTGTACTGAGAGTGCACCATATGCGGTGTGAAATACCGCACAGATGCGTAAGGAGAAAATACCGCATCAGGCGCTCTTCCGCTTCCTCGCTCACTGACTCGCTGCGCTCGGTCGTTCGGCTGCGGCGAGCGGTATCAGCTCACTCAAAGGCGGTAATACGGTTATCCACAGAATCAGGGGATAACGCAGGAAAGAACATGTGAGCAAAAGGCCAGCAAAAGGCCAGGAACCGTAAAAAGGCCGCGTTGCTGGCGTTTTTCCATAGGCTCCGCCCCCCTGACGAGCATCACAAAAATCGACGCTCAAGTCAGAGGTGGCGAAACCCGACAGGACTATAAAGATACCAGGCGTTTCCCCCTGGAAGCTCCCTCGTGCGCTCTCCTGTTCCGACCCTGCCGCTTACCGGATACCTGTCCGCCTTTCTCCCTTCGGGAAGCGTGGCGCTTTCTCATAGCTCACGCTGTAGGTATCTCAGTTCGGTGTAGGTCGTTCGCTCCAAGCTGGGCTGTGTGCACGAACCCCCCGTTCAGCCCGACCGCTGCGCCTTATCCGGTAACTATCGTCTTGAGTCCAACCCGGTAAGACACGACTTATCGCCACTGGCAGCAGCCACTGGTAACAGGATTAGCAGAGCGAGGTATGTAGGCGGTGCTACAGAGTTCTTGAAGTGGTGGCCTAACTACGGCTACACTAGAAGGACAGTATTTGGTATCTGCGCTCTGCTGAAGCCAGTTACCTTCGGAAAAAGAGTTGGTAGCTCTTGATCCGGCAAACAAACCACCGCTGGTAGCGGTGGTTTTTTTGTTTGCAAGCAGCAGATTACGCGCAGAAAAAAAGGATCTCAAGAAGATCCTTTGATCTTTTCTACGGGGTCTGACGCTCAGTGGAACGAAAACTCACGTTAAGGGATTTTGGTCATGAGATTATCAAAAAGGATCTTCACCTAGATCCTTTTAAATTAAAAATGAAGTTTTAAATCAATCTAAAGTATATATGAGTAAACTTGGTCTGACAGTTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTATTTCGTTCATCCATAGTTGCCTGACTCCCCGTCGTGTAGATAACTACGATACGGGAGGGCTTACCATCTGGCCCCAGTGCTGCAATGATACCGCGAGACCCACGCTCACCGGCTCCAGATTTATCAGCAATAAACCAGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCCATCCAGTCTATTAATTGTTGCCGGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTGTTGCCATTGCTGCAGGCATCGTGGTGTCACGCTCGTCGTTTGGTATGGCTTCATTCAGCTCCGGTTCCCAACGATCAAGGCGAGTTACATGATCCCCCATGTTGTGCAAAAAAGCGGTTAGCTCCTTCGGTCCTCCGATCGTTGTCAGAAGTAAGTTGGCCGCAGTGTTATCACTCATGGTTATGGCAGCACTGCATAATTCTCTTACTGTCATGCCATCCGTAAGATGCTTTTCTGTGACTGGTGAGTACTCAACCAAGTCATTCTGAGAATAGTGTATGCGGCGACCGAGTTGCTCTTGCCCGGCGTCAACACGGGATAATACCGCGCCACATAGCAGAACTTTAAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTAC
CGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACATTTCCCCGAAAAGTGCCACCTGACGTCTAAGAAACCATTATTATCATGACATTAACCTATAAAAATAGGCGTATCACGAGGCCCTTTCGTCTTCAAGAA
b
diff -r 68693743661e -r 5b4ac70948d2 test-data/pBR322.gbk
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/pBR322.gbk Tue Mar 27 09:44:18 2018 -0400
b
b'@@ -0,0 +1,474 @@\n+LOCUS       SYNPBR322               4361 bp    DNA     circular SYN 30-SEP-2008\n+DEFINITION  Cloning vector pBR322, complete sequence.\n+ACCESSION   J01749 K00005 L08654 M10282 M10283 M10286 M10356 M10784 M10785\n+            M10786 M33694 V01119\n+VERSION     J01749.1\n+KEYWORDS    ampicillin resistance; beta-lactamase; cloning vector; drug\n+            resistance protein; origin of replication; plasmid; tetracycline\n+            resistance.\n+SOURCE      Cloning vector pBR322\n+  ORGANISM  Cloning vector pBR322\n+            other sequences; artificial sequences; vectors.\n+REFERENCE   1  (bases 1 to 3; 3259 to 4361)\n+  AUTHORS   Sutcliffe,J.G.\n+  TITLE     Nucleotide sequence of the ampicillin resistance gene of\n+            Escherichia coli plasmid pBR322\n+  JOURNAL   Proc. Natl. Acad. Sci. U.S.A. 75 (8), 3737-3741 (1978)\n+   PUBMED   358200\n+REFERENCE   2  (bases 1 to 4361)\n+  AUTHORS   Sutcliffe,J.G.\n+  TITLE     Complete nucleotide sequence of the Escherichia coli plasmid pBR322\n+  JOURNAL   Cold Spring Harb. Symp. Quant. Biol. 43 (Pt 1), 77-90 (1979)\n+   PUBMED   383387\n+REFERENCE   3  (bases 1500 to 2300)\n+  AUTHORS   Reed,R.R., Young,R.A., Steitz,J.A., Grindley,N.D. and Guyer,M.S.\n+  TITLE     Transposition of the Escherichia coli insertion element gamma\n+            generates a five-base-pair repeat\n+  JOURNAL   Proc. Natl. Acad. Sci. U.S.A. 76 (10), 4882-4886 (1979)\n+   PUBMED   388421\n+REFERENCE   4  (bases 2207 to 2265)\n+  AUTHORS   Covarrubias,L., Cervantes,L., Covarrubias,A., Soberon,X.,\n+            Vichido,I., Blanco,A., Kupersztoch-Portnoy,Y.M. and Bolivar,F.\n+  TITLE     Construction and characterization of new cloning vehicles. V.\n+            Mobilization and coding properties of pBR322 and several deletion\n+            derivatives including pBR327 and pBR328\n+  JOURNAL   Gene 13 (1), 25-35 (1981)\n+   PUBMED   6263753\n+REFERENCE   5  (bases 2000 to 2500)\n+  AUTHORS   Marians,K.J., Soeller,W. 
and Zipursky,S.L.\n+  TITLE     Maximal limits of the Escherichia coli replication factor Y\n+            effector site sequences in pBR322 DNA\n+  JOURNAL   J. Biol. Chem. 257 (10), 5656-5662 (1982)\n+   PUBMED   6279609\n+REFERENCE   6  (bases 1 to 80; 4151 to 4229; 4349 to 4361)\n+  AUTHORS   Brosius,J., Cate,R.L. and Perlmutter,A.P.\n+  TITLE     Precise location of two promoters for the beta-lactamase gene of\n+            pBR322. S1 mapping of ribonucleic acid isolated from Escherichia\n+            coli or synthesized in vitro\n+  JOURNAL   J. Biol. Chem. 257 (15), 9205-9210 (1982)\n+   PUBMED   6178738\n+REFERENCE   7  (bases 4241 to 4343)\n+  AUTHORS   Van Dyke,M.W., Hertzberg,R.P. and Dervan,P.B.\n+  TITLE     Map of distamycin, netropsin, and actinomycin binding sites on\n+            heterogeneous DNA: DNA cleavage-inhibition patterns with\n+            methidiumpropyl-EDTA.Fe(II)\n+  JOURNAL   Proc. Natl. Acad. Sci. U.S.A. 79 (18), 5470-5474 (1982)\n+   PUBMED   6291045\n+REFERENCE   8  (bases 584 to 709)\n+  AUTHORS   Peden,K.W. and Nathans,D.\n+  TITLE     Local mutagenesis within deletion loops of DNA heteroduplexes\n+  JOURNAL   Proc. Natl. Acad. Sci. U.S.A. 79 (23), 7214-7217 (1982)\n+   PUBMED   6760191\n+REFERENCE   9  (bases 373 to 649)\n+  AUTHORS   Peden,K.W.\n+  TITLE     Revised sequence of the tetracycline-resistance gene of pBR322\n+  JOURNAL   Gene 22 (2-3), 277-280 (1983)\n+   PUBMED   6307828\n+REFERENCE   10 (bases 132 to 181)\n+  AUTHORS   Watabe,H., Iino,T., Kaneko,T., Shibata,T. and Ando,T.\n+  TITLE     A new class of site-specific endodeoxyribonucleases. Endo.Sce I\n+            isolated from a eukaryote, Saccharomyces cerevisiae\n+  JOURNAL   J. Biol. Chem. 258 (8), 4663-4665 (1983)\n+   PUBMED   6300094\n+REFERENCE   11 (bases 368 to 581)\n+  AUTHORS   Livneh,Z.\n+  TITLE     Directed mutagenesis method for analysis of mutagen specificity:\n+            application to ultraviolet-induced mutagenesis\n+  JOURNAL   Proc. Natl. 
Acad. Sci. U.S.A. 80 (1), 237-241 (1983)\n+   PUBMED   6337373\n+REFERENCE   12 (base'..b'tggagccg\n+     1261 ggccacctcg acctgaatgg aagccggcgg cacctcgcta acggattcac cactccaaga\n+     1321 attggagcca atcaattctt gcggagaact gtgaatgcgc aaaccaaccc ttggcagaac\n+     1381 atatccatcg cgtccgccat ctccagcagc cgcacgcggc gcatctcggg cagcgttggg\n+     1441 tcctggccac gggtgcgcat gatcgtgctc ctgtcgttga ggacccggct aggctggcgg\n+     1501 ggttgcctta ctggttagca gaatgaatca ccgatacgcg agcgaacgtg aagcgactgc\n+     1561 tgctgcaaaa cgtctgcgac ctgagcaaca acatgaatgg tcttcggttt ccgtgtttcg\n+     1621 taaagtctgg aaacgcggaa gtcagcgccc tgcaccatta tgttccggat ctgcatcgca\n+     1681 ggatgctgct ggctaccctg tggaacacct acatctgtat taacgaagcg ctggcattga\n+     1741 ccctgagtga tttttctctg gtcccgccgc atccataccg ccagttgttt accctcacaa\n+     1801 cgttccagta accgggcatg ttcatcatca gtaacccgta tcgtgagcat cctctctcgt\n+     1861 ttcatcggta tcattacccc catgaacaga aatccccctt acacggaggc atcagtgacc\n+     1921 aaacaggaaa aaaccgccct taacatggcc cgctttatca gaagccagac attaacgctt\n+     1981 ctggagaaac tcaacgagct ggacgcggat gaacaggcag acatctgtga atcgcttcac\n+     2041 gaccacgctg atgagcttta ccgcagctgc ctcgcgcgtt tcggtgatga cggtgaaaac\n+     2101 ctctgacaca tgcagctccc ggagacggtc acagcttgtc tgtaagcgga tgccgggagc\n+     2161 agacaagccc gtcagggcgc gtcagcgggt gttggcgggt gtcggggcgc agccatgacc\n+     2221 cagtcacgta gcgatagcgg agtgtatact ggcttaacta tgcggcatca gagcagattg\n+     2281 tactgagagt gcaccatatg cggtgtgaaa taccgcacag atgcgtaagg agaaaatacc\n+     2341 gcatcaggcg ctcttccgct tcctcgctca ctgactcgct gcgctcggtc gttcggctgc\n+     2401 ggcgagcggt atcagctcac tcaaaggcgg taatacggtt atccacagaa tcaggggata\n+     2461 acgcaggaaa gaacatgtga gcaaaaggcc agcaaaaggc caggaaccgt aaaaaggccg\n+     2521 cgttgctggc gtttttccat aggctccgcc cccctgacga gcatcacaaa aatcgacgct\n+     2581 caagtcagag gtggcgaaac ccgacaggac tataaagata ccaggcgttt ccccctggaa\n+     2641 gctccctcgt gcgctctcct gttccgaccc tgccgcttac cggatacctg tccgcctttc\n+     2701 tcccttcggg 
aagcgtggcg ctttctcata gctcacgctg taggtatctc agttcggtgt\n+     2761 aggtcgttcg ctccaagctg ggctgtgtgc acgaaccccc cgttcagccc gaccgctgcg\n+     2821 ccttatccgg taactatcgt cttgagtcca acccggtaag acacgactta tcgccactgg\n+     2881 cagcagccac tggtaacagg attagcagag cgaggtatgt aggcggtgct acagagttct\n+     2941 tgaagtggtg gcctaactac ggctacacta gaaggacagt atttggtatc tgcgctctgc\n+     3001 tgaagccagt taccttcgga aaaagagttg gtagctcttg atccggcaaa caaaccaccg\n+     3061 ctggtagcgg tggttttttt gtttgcaagc agcagattac gcgcagaaaa aaaggatctc\n+     3121 aagaagatcc tttgatcttt tctacggggt ctgacgctca gtggaacgaa aactcacgtt\n+     3181 aagggatttt ggtcatgaga ttatcaaaaa ggatcttcac ctagatcctt ttaaattaaa\n+     3241 aatgaagttt taaatcaatc taaagtatat atgagtaaac ttggtctgac agttaccaat\n+     3301 gcttaatcag tgaggcacct atctcagcga tctgtctatt tcgttcatcc atagttgcct\n+     3361 gactccccgt cgtgtagata actacgatac gggagggctt accatctggc cccagtgctg\n+     3421 caatgatacc gcgagaccca cgctcaccgg ctccagattt atcagcaata aaccagccag\n+     3481 ccggaagggc cgagcgcaga agtggtcctg caactttatc cgcctccatc cagtctatta\n+     3541 attgttgccg ggaagctaga gtaagtagtt cgccagttaa tagtttgcgc aacgttgttg\n+     3601 ccattgctgc aggcatcgtg gtgtcacgct cgtcgtttgg tatggcttca ttcagctccg\n+     3661 gttcccaacg atcaaggcga gttacatgat cccccatgtt gtgcaaaaaa gcggttagct\n+     3721 ccttcggtcc tccgatcgtt gtcagaagta agttggccgc agtgttatca ctcatggtta\n+     3781 tggcagcact gcataattct cttactgtca tgccatccgt aagatgcttt tctgtgactg\n+     3841 gtgagtactc aaccaagtca ttctgagaat agtgtatgcg gcgaccgagt tgctcttgcc\n+     3901 cggcgtcaac acgggataat accgcgccac atagcagaac tttaaaagtg ctcatcattg\n+     3961 gaaaacgttc ttcggggcga aaactctcaa ggatcttacc gctgttgaga tccagttcga\n+     4021 tgtaacccac tcgtgcaccc aactgatctt cagcatcttt tactttcacc agcgtttctg\n+     4081 ggtgagcaaa aacaggaagg caaaatgccg caaaaaaggg aataagggcg acacggaaat\n+     4141 gttgaatact catactcttc ctttttcaat attattgaag catttatcag ggttattgtc\n+     4201 tcatgagcgg atacatattt gaatgtattt agaaaaataa acaaataggg 
gttccgcgca\n+     4261 catttccccg aaaagtgcca cctgacgtct aagaaaccat tattatcatg acattaacct\n+     4321 ataaaaatag gcgtatcacg aggccctttc gtcttcaaga a\n+//\n+\n'