view snpEff_create_db.xml @ 27:9473cd297a76 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/snpeff commit af2e922183065d2a950e07b4027cc6600503d38a
author iuc
date Wed, 22 Nov 2023 19:44:16 +0000
parents 5c7b70713fb5
children
line wrap: on
line source

<tool id="snpEff_build_gb" name="SnpEff build:" version="@WRAPPER_VERSION@.galaxy6" profile="22.01">
    <!-- Shown right after the tool name; names every annotation format the tool accepts (GenBank, GFF, GTF). -->
    <description> database from Genbank, GFF or GTF record</description>
    <macros>
        <!-- Shared snpEff macros and tokens used throughout this wrapper. -->
        <import>snpEff_macros.xml</import>
    </macros>
    <requirements>
        <expand macro="requirement" />
        <!-- NOTE(review): presumably required by the gbk2fa.py helper invoked in the command section; confirm. -->
        <requirement type="package" version="1.79">biopython</requirement>
    </requirements>
    <expand macro="stdio" />
    <expand macro="version_command" />
    <command><![CDATA[
        ## Optional step: convert the GenBank record to FASTA with the bundled
        ## gbk2fa.py script (GenBank input with FASTA parsing switched on only).
        #if str($input_type.input_type_selector) == "gb" and str($input_type.fasta) == "yes":
            python3 '$__tool_directory__/gbk2fa.py' '${input_type.input}' '${output_fasta}' ${input_type.remove_version} &&
        #end if

        ## Create the final database location (extra files path of the output)
        ## and a scratch data directory in the working directory for the build.
        mkdir -p '${snpeff_output.files_path}/${genome_version}' &&
        mkdir -p snpeff_output/'${genome_version}' &&

        ## Link the inputs into the scratch data directory under the genes.* and
        ## sequences.* names; the suffix reflects whether the dataset is gzipped.
        #if str($input_type.input_type_selector) == "gb":
            #if $input_type.input.is_of_type("genbank"):
                ln -s '${input_type.input}' 'snpeff_output/${genome_version}/genes.gbk' &&
            #elif $input_type.input.is_of_type("genbank.gz"):
                ln -s '${input_type.input}' 'snpeff_output/${genome_version}/genes.gbk.gz' &&
            #end if
        #else:
            ## GFF and GTF carry no sequence, so the reference FASTA is linked too.
            ## Selector values are compared through str() like everywhere else in this template.
            #if str($input_type.reference_source.reference_source_selector) == "history":
                #if $input_type.reference_source.input_fasta.is_of_type("fasta"):
                    ln -s '${input_type.reference_source.input_fasta}' 'snpeff_output/${genome_version}/sequences.fa' &&
                #elif $input_type.reference_source.input_fasta.is_of_type("fasta.gz"):
                    ln -s '${input_type.reference_source.input_fasta}' 'snpeff_output/${genome_version}/sequences.fa.gz' &&
                #end if
            #elif str($input_type.reference_source.reference_source_selector) == "cached":
                ln -s '${input_type.reference_source.ref_file.fields.path}' 'snpeff_output/${genome_version}/sequences.fa' &&
            #end if
            ## The annotation file is named after the selected format (genes.gff or genes.gtf).
            ln -s '${input_type.input}' 'snpeff_output/${genome_version}/genes.${input_type.input_type_selector}' &&
        #end if

        ## Build the database; the genome entry and codon table are passed as
        ## -configOption values and the format flag follows the selected input type.
        snpEff @JAVA_OPTIONS@ build -v
        -configOption '${genome_version}'.genome='${genome_version}'
        -configOption '${genome_version}'.codonTable='${codon_table}'
        #if str($input_type.input_type_selector) == "gb":
            -genbank
        #elif str($input_type.input_type_selector) == "gff":
            -gff3
        #elif str($input_type.input_type_selector) == "gtf":
            -gtf22
        #end if
        -dataDir "\$(pwd)/snpeff_output" '${genome_version}' &&
        ## Move the built .bin files into the output and record the genome name
        ## and codon table in a snpEff.config stored next to them.
        mv snpeff_output/'${genome_version}'/*.bin '${snpeff_output.files_path}/${genome_version}' &&
        echo '${genome_version}.genome : ${genome_version}' >> '${snpeff_output.files_path}'/snpEff.config &&
        echo '${genome_version}.codonTable : ${codon_table}' >> '${snpeff_output.files_path}'/snpEff.config
    ]]></command>
    <inputs>
        <!-- Database name: used as the directory name and as the genome key in snpEff.config. -->
        <param name="genome_version"
               type="text"
               value=""
               label="Name for the database"
               help="For E. coli K12, for example, you may want to use 'EcK12'. Note: Spaces are not allowed in the name and will get converted to underscores.">
            <sanitizer>
                <!-- Letters and digits plus "-", "." and "_" are accepted as-is. -->
                <valid initial="string.ascii_letters,string.digits">
                    <add value="-"/>
                    <add value="."/>
                    <add value="_"/>
                </valid>
                <!-- Spaces are mapped to underscores, as the help text states. -->
                <mapping>
                    <add source=" " target="_"/>
                </mapping>
            </sanitizer>
            <validator type="empty_field" message="A genome version name is required"/>
        </param>
        <!-- Annotation format selector; each choice exposes its own inputs. -->
        <conditional name="input_type">
            <param name="input_type_selector"
                   type="select"
                   display="radio"
                   label="Input annotations are in"
                   help="Specify format for annotations you are using to create SnpEff database">
                <option value="gb" selected="true">GenBank</option>
                <option value="gff">GFF</option>
                <option value="gtf">GTF</option>
            </param>
            <!-- GenBank: one dataset, optionally also exported as FASTA. -->
            <when value="gb">
                <param name="input"
                       type="data"
                       format="genbank,genbank.gz"
                       label="Genbank dataset to build database from"
                       help="This Genbank file will be used to generate snpEff database" />
                <param name="fasta"
                       type="select"
                       display="radio"
                       label="Parse Genbank into Fasta"
                       help="This will generate an additional dataset containing all sequences from Genbank file in FASTA format">
                    <option value="yes" selected="true">Yes</option>
                    <option value="no">No</option>
                </param>
                <param argument="--remove_version"
                       type="boolean"
                       truevalue="--remove_version"
                       falsevalue=""
                       checked="true"
                       label="Remove sequence version label?"
                       help="Genbank sequences have version numbers such as B000564.2. This option removes them leaving only B000564" />
            </when>
            <!-- GFF: annotation dataset plus a reference chosen through the ref_select macro. -->
            <when value="gff">
                <param name="input"
                       type="data"
                       format="gff3"
                       label="GFF dataset to build database from"
                       help="This GFF file will be used to generate snpEff database" />
                <expand macro="ref_select" />
            </when>
            <!-- GTF: annotation dataset plus a reference chosen through the ref_select macro. -->
            <when value="gtf">
                <param name="input"
                       type="data"
                       format="gtf"
                       label="GTF dataset to build database from"
                       help="This GTF file will be used to generate snpEff database" />
                <expand macro="ref_select" />
            </when>
        </conditional>
        <!-- Codon table name handed to snpEff as the <genome>.codonTable config option. -->
        <param name="codon_table"
               type="select"
               label="Select genetic code for this sequence"
               help="If this sequence uses non-standard genetic code, select one from these options">
            <option value="Standard" selected="true">Standard</option>
            <option value="Vertebrate_Mitochondrial">Vertebrate_Mitochondrial</option>
            <option value="Yeast_Mitochondrial">Yeast_Mitochondrial</option>
            <option value="Mold_Mitochondrial">Mold_Mitochondrial</option>
            <option value="Protozoan_Mitochondrial">Protozoan_Mitochondrial</option>
            <option value="Coelenterate">Coelenterate</option>
            <option value="Mitochondrial">Mitochondrial</option>
            <option value="Mycoplasma">Mycoplasma</option>
            <option value="Spiroplasma">Spiroplasma</option>
            <option value="Invertebrate_Mitochondrial">Invertebrate_Mitochondrial</option>
            <option value="Ciliate_Nuclear">Ciliate_Nuclear</option>
            <option value="Dasycladacean_Nuclear">Dasycladacean_Nuclear</option>
            <option value="Hexamita_Nuclear">Hexamita_Nuclear</option>
            <option value="Echinoderm_Mitochondrial">Echinoderm_Mitochondrial</option>
            <option value="Flatworm_Mitochondrial">Flatworm_Mitochondrial</option>
            <option value="Euplotid_Nuclear">Euplotid_Nuclear</option>
            <option value="Bacterial_and_Plant_Plastid">Bacterial_and_Plant_Plastid</option>
            <option value="Alternative_Yeast_Nuclear">Alternative_Yeast_Nuclear</option>
            <option value="Ascidian_Mitochondrial">Ascidian_Mitochondrial</option>
            <option value="Alternative_Flatworm_Mitochondrial">Alternative_Flatworm_Mitochondrial</option>
            <option value="Blepharisma_Macronuclear">Blepharisma_Macronuclear</option>
            <option value="Chlorophycean_Mitochondrial">Chlorophycean_Mitochondrial</option>
            <option value="Trematode_Mitochondrial">Trematode_Mitochondrial</option>
            <option value="Scenedesmus_obliquus_Mitochondrial">Scenedesmus_obliquus_Mitochondrial</option>
            <option value="Thraustochytrium_Mitochondrial">Thraustochytrium_Mitochondrial</option>
        </param>
    </inputs>
    <outputs>
        <!-- The SnpEff database; this output carries no filter. -->
        <data name="snpeff_output"
              format="snpeffdb"
              label="@SNPEFF_VERSION@ database for ${genome_version}" />
        <!-- FASTA extracted from the GenBank input; filtered on GenBank input with FASTA parsing set to "yes". -->
        <data name="output_fasta"
              format="fasta"
              label="Fasta sequences for ${genome_version}">
            <filter>input_type['input_type_selector'] == 'gb'</filter>
            <filter>input_type['fasta'] == 'yes'</filter>
        </data>
    </outputs>
    <tests>
        <!-- expect_num_outputs pins whether the filtered output_fasta dataset is created:
             two outputs for GenBank input (FASTA parsing defaults to "yes"), one for GFF/GTF. -->
        <!-- GenBank input, uncompressed. -->
        <test expect_num_outputs="2">
            <param name="genome_version" value="pBR322"/>
            <param name="input_type_selector" value="gb"/>
            <param name="input" value="pBR322.gbk" />
            <output name="snpeff_output">
                <assert_contents>
                    <has_text text="pBR322" />
                </assert_contents>
            </output>
            <output name="output_fasta" value="pBR322_test1.fna"/>
        </test>
        <!-- GenBank input, gzip-compressed. -->
        <test expect_num_outputs="2">
            <param name="genome_version" value="pBR322"/>
            <param name="input_type_selector" value="gb"/>
            <param name="input" value="pBR322.gbk.gz" />
            <output name="snpeff_output">
                <assert_contents>
                    <has_text text="pBR322" />
                </assert_contents>
            </output>
            <output name="output_fasta" value="pBR322_test1.fna"/>
        </test>
        <!-- GFF3 input with an uncompressed FASTA reference from the history. -->
        <test expect_num_outputs="1">
            <param name="genome_version" value="pBR322"/>
            <param name="input_type_selector" value="gff"/>
            <param name="reference_source_selector" value="history"/>
            <param name="input_fasta" value="pBR322_test2.fna" />
            <param name="input" value="pBR322.gff3"/>
            <output name="snpeff_output">
                <assert_contents>
                    <has_text text="pBR322" />
                </assert_contents>
            </output>
        </test>
        <!-- GFF3 input with a gzip-compressed FASTA reference from the history. -->
        <test expect_num_outputs="1">
            <param name="genome_version" value="pBR322"/>
            <param name="input_type_selector" value="gff"/>
            <param name="reference_source_selector" value="history"/>
            <param name="input_fasta" value="pBR322_test2.fna.gz" />
            <param name="input" value="pBR322.gff3"/>
            <output name="snpeff_output">
                <assert_contents>
                    <has_text text="pBR322" />
                </assert_contents>
            </output>
        </test>
        <!-- GTF input with a gzip-compressed FASTA reference from the history. -->
        <test expect_num_outputs="1">
            <param name="genome_version" value="Saccharomyces_mito"/>
            <param name="input_type_selector" value="gtf"/>
            <param name="reference_source_selector" value="history"/>
            <param name="input_fasta" value="Saccharomyces_mito.fa.gz" />
            <param name="input" value="Saccharomyces_mito.gtf" />
            <output name="snpeff_output">
                <assert_contents>
                    <has_text text="Saccharomyces_mito" />
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
**What it does**

This tool uses the `"snpEff build -genbank"`, `"snpEff build -gff3"` or `"snpEff build -gtf22"` command to create a snpEff database from a Genbank, GFF3 or GTF annotation dataset, respectively.

------

.. class:: infomark

**Working with Genbank files**

Using Genbank data for creating databases has several advantages:

 #. Genbank files contain annotations (such as locations of genes) together with sequences. This ensures that these two are in sync with each other.
 #. When you are analyzing small genomes (or not so small) it is much more convenient to create a database on the fly and use it.

 .. class:: warningmark

 SnpEff errors out on highly fragmented genomes containing multiple scaffolds. This is because a single gene may be split between multiple scaffolds, causing SnpEff to crash. If this happens, use the GFF route described below.

-------

**Genbank usage scenario**

Suppose you have a series of Illumina reads from an experiment involving *E. coli* K-12 MG1655. You want to map these reads to the reference genome of K-12 MG1655, call variants, and annotate them using snpEff. This tool enables you to follow the following analysis steps:

 #. Go to `NCBI <http://www.ncbi.nlm.nih.gov>`_ page for K-12 MG1655 genome (note that all NCBI genomes have similar list of files associated with them).
 #. Copy URL for file with extension `gbff.gz`
 #. Paste the URL into upload tool and set datatype to `genbank.gz`.
 #. Use this tool to generate a snpEff database and FASTA sequences from the dataset you've uploaded during the previous step.
 #. Map your Illumina reads against the FASTA dataset generated in the previous step using BWA-MEM.
 #. Call variants using **Freebayes**.
 #. Annotate the VCF output of Freebayes with **SnpEff eff** using the database generated at step 4 (using the *Custom* option for the **Genome source** parameter).

In this scenario the Genbank dataset is used twice. First, it is used to produce the FASTA sequences that BWA maps the reads against. Second, it is used to create the snpEff database. This guarantees that you will not have any issues related to reference sequence naming.

-------

.. class:: infomark

**Working with GFF files**

Alternatively you can create a SnpEff database from `GFF3 <https://en.wikipedia.org/wiki/General_feature_format>`_ files downloaded from NCBI or any other source. Using GFF dataset for building SnpEff database requires two inputs:

 #. The GFF file itself
 #. A genome in FASTA format

The GFF file contains coordinates of various features, but does not contain underlying sequences. This is why a FASTA file needs to be provided as well.

------

**GFF usage scenario**

The following example also uses *E. coli* K-12 MG1655:

#. Go to `NCBI <http://www.ncbi.nlm.nih.gov>`_ page for K-12 MG1655 genome.
#. Copy URLs for files with `gff.gz` and `fna.gz` extensions. The first file contains annotations in GFF3 format. The second file contains entire genome as a FASTA record.
#. Paste URLs into upload tool and set datatypes to `gff3` and `fasta.gz` for annotations and genome, respectively.
#. Use this tool to generate a snpEff database from the GFF dataset.
#. Map your reads against the FASTA dataset and continue as described in the above example.


@SNPEFF_IN_GALAXY_INFO@
@EXTERNAL_DOCUMENTATION@
]]>
    </help>
    <expand macro="citations" />
</tool>