view antismash.xml @ 4:e78e25d3b4bd draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/antismash commit f5f8e44e726c9f2cc57e0f0fe8182a73afa56669
author bgruening
date Tue, 31 May 2022 14:04:07 +0000
parents 5784e268efca
children bc88856eddab
line wrap: on
line source

<tool id="antismash" name="Antismash" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01">
    <description>allows the genome-wide identification, annotation and analysis of secondary metabolite biosynthesis gene clusters</description>
    <!-- Pull in the shared macro file; the expansions below are expected to be defined there. -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <!-- Software requirements and bio.tools cross-reference, both supplied by macros. -->
    <expand macro="requirements" />
    <expand macro="bio_tools" />
    <!-- Command Galaxy runs to record the installed antiSMASH version. -->
    <version_command>antismash --version</version_command>
    <command detect_errors="aggressive">
<![CDATA[
        ## Suppress Python FutureWarning messages for the whole job.
        export PYTHONWARNINGS="ignore::FutureWarning" &&

        ## Extra files of the HTML output (the complete result tree) live in its files_path.
        #set $htmloutputfolder = $html.files_path

        ## The input is linked under a fixed base name; the Galaxy 'genbank' datatype is
        ## given the 'gb' suffix, every other datatype keeps its own extension.
        #if str($infile.ext) == 'genbank':
            #set $file_extension = 'gb'
        #else:
            #set $file_extension = $infile.ext
        #end if

        ln -s '$infile' input_tempfile.$file_extension &&
        #if $genefinding_gff3
            ## Dataset path is quoted, like the sequence input above.
            ln -s '$genefinding_gff3' annotation.gff3 &&
        #end if

        ## create html folder
        mkdir -p '$htmloutputfolder' &&

        antismash
            --cpus "\${GALAXY_SLOTS:-12}"
            --taxon '${cond_taxon.taxon}'
            #if $genefinding_gff3
                --genefinding-gff3 annotation.gff3
            #end if
            --genefinding-tool $cond_taxon.genefinding_tool

            ## Boolean parameters expand either to their flag or to an empty string.
            ${cb_general}
            ${cb_subclusters}
            ${cb_knownclusters}
            ${smcog_trees}
            --tta-threshold ${tta_threshold}
            ${asf}

            ${clusterhmmer}
            ${fullhmmer}
            ## Taxon-specific analyses: CASSIS exists only in the fungi branch of the
            ## conditional, TIGRFam only in the bacteria branch.
            #if $cond_taxon.taxon == 'fungi':
                $cond_taxon.cassis
            #else
                $cond_taxon.tigrfam
            #end if

            ${cc_mibig}
            ## The pfam2go checkbox was declared in the inputs but never passed on.
            ${pfam2go}
            ${rre}
            --logfile '$log'

            ## Advanced options
            --minlength $advanced_options.minlength
            --hmmdetection-strictness $advanced_options.hmmdetection_strictness
            --cb-nclusters $advanced_options.cb_nclusters
            --cb-min-homology-scale $advanced_options.cb_min_homology_scale
            --rre-cutoff $advanced_options.rre_cutoff
            --rre-minlength $advanced_options.rre_minlength

            input_tempfile.$file_extension &&

        ## Results are read from the input_tempfile directory: index.html becomes the
        ## HTML dataset and the whole tree is copied next to it as extra files.
        cp input_tempfile/index.html '${html}' 2> /dev/null &&
        cp -r input_tempfile/* '${htmloutputfolder}'
]]>
    </command>
    <inputs>
        <!-- Sequence to analyse and an optional GFF3 annotation for it. -->
        <param name="infile" type="data" format="genbank,fasta,embl" label="Sequence file in GenBank,EMBL or FASTA format"/>
        <param argument="--genefinding-gff3" type="data" format="gff3" optional="true" label="GFF3 file" help="Specify GFF3 file to extract features from" />

        <!-- The taxon decides which gene finding options and which taxon-specific analysis are offered. -->
        <conditional name="cond_taxon">
            <param argument="--taxon" type="select" label="Taxonomic classification of input sequence" help="Source of DNA">
                <option value="bacteria" selected="true">Bacteria</option>
                <option value="fungi">Fungi</option>
            </param>
            <when value="bacteria">
                <expand macro="genefinding">
                    <option value="prodigal" selected="true">Prodigal</option>
                    <option value="prodigal-m">Prodigal Metagenomic/Anonymous</option>
                </expand>
                <param argument="--tigrfam" type="boolean" truevalue="--tigrfam" falsevalue="" checked="false"
                    label="Annotate with TIGRFam"
                    help="Annotate clusters using TIGRFam profiles. TIGRFAMs is a collection of manually curated protein families focusing primarily on prokaryotic sequences" />
            </when>
            <when value="fungi">
                <expand macro="genefinding"/>
                <param argument="--cassis" type="boolean" truevalue="--cassis" falsevalue="" checked="false"
                    label="Motif based prediction of SM gene cluster regions"
                    help="Improved prediction of gene cluster borders for fungal BGCs (CASSIS)"/>
            </when>
        </conditional>

        <!-- Optional analyses; each boolean expands to its command line flag when checked. -->
        <param argument="--fullhmmer" type="boolean" truevalue="--fullhmmer" falsevalue="" checked="false"
            label="Full genome PFAM annotation"
            help="Each gene product encoded in the detected BGCs is analyzed against the PFAM database. Hits are annotated in the final Genbank/EMBL files. Also, selecting this option normally increases the runtime"/>

        <param argument="--clusterhmmer" type="boolean" truevalue="--clusterhmmer" falsevalue="" checked="false"
            label="PFAM annotation for only clusters"
            help="Run a cluster-limited HMMer analysis" />

        <param argument="--asf" type="boolean" truevalue="--asf" falsevalue="" checked="true"
            label="Run active site finder analysis"
            help="Active sites of several highly conserved biosynthetic enzymes are detected and variations of the active sites are reported"/>

        <param argument="--cc-mibig" type="boolean" truevalue="--cc-mibig" falsevalue="" checked="false"
            label="Comparison against MIBiG database"
            help="Run a comparison against the MIBiG database" />

        <param argument="--cb-general" type="boolean" truevalue="--cb-general" falsevalue="" checked="false"
            label="BLAST identified clusters against known clusters"
            help="Compare identified clusters against a database of antiSMASH-predicted clusters." />

        <param argument="--cb-knownclusters" type="boolean" truevalue="--cb-knownclusters" falsevalue="" checked="true"
            label="KnownCluster BLAST analysis"
            help="Compare identified clusters against known gene clusters from the MIBiG database. MIBiG is a hand curated data collection of biosynthetic gene clusters, which have been experimentally characterized"/>

        <param argument="--cb-subclusters" type="boolean" truevalue="--cb-subclusters" falsevalue="" checked="true"
            label="Subcluster BLAST analysis"
            help="The identified clusters are searched against a database containing operons involved in the biosynthesis of common secondary metabolite building blocks (e.g. the biosynthesis of non-proteinogenic amino acids)" />

        <param argument="--pfam2go" type="boolean" truevalue="--pfam2go" falsevalue="" checked="true"
            label="Run Pfam to Gene Ontology mapping module" />

        <param argument="--rre" type="boolean" truevalue="--rre" falsevalue="" checked="true"
            label="RREFinder precision mode"
            help="Run RREFinder precision mode on all RiPP gene clusters. Many ribosomally synthesized and posttranslationally modified peptide classes (RiPPs) are reliant on a domain called the RiPP recognition element (RRE). The RRE binds specifically to a precursor peptide and directs the posttranslational modification enzymes to their substrates" />

        <param argument="--smcog-trees" type="boolean" truevalue="--smcog-trees" falsevalue="" checked="true"
            label="Analysis of secondary metabolism gene families (smCOGs)"
            help="It attempts to allocate each gene in the detected gene clusters to a secondary metabolism-specific gene family using profile hidden Markov models specific for the conserved sequence region characteristic of this family. In other words, each gene of the cluster is compared to a database of clusters of orthologous groups of proteins involved in secondary metabolism"/>

        <param argument="--tta-threshold" type="float" value="0.65"
            label="Lowest GC content to annotate TTA codons at"
            help="High-GC containing bacterial sequences contain the rare Leu-codon “TTA” as a means of post-transcriptional regulation by limiting/controlling the amount of TTA-tRNA in the cell. This type of regulation is commonly found in secondary metabolite BGCs. This feature will annotate such TTA codons in the identified BGCs. Default: 0.65"/>

        <!-- Tuning values that are always passed on the command line with the defaults shown here. -->
        <section name="advanced_options" title="Advanced options">
            <param argument="--minlength" type="integer" min="0" value="1000" label="Min length" help="Only process sequences larger than this value. Default: 1000" />
            <param argument="--hmmdetection-strictness" type="select" label="HMM detection strictness" help="Defines which level of strictness to use for HMM-based cluster detection. Default: relaxed">
                <option value="strict">Strict</option>
                <option value="relaxed" selected="true">Relaxed</option>
                <option value="loose">Loose</option>
            </param>
            <param argument="--cb-nclusters" type="integer" min="0" max="50" value="10" label="Number of clusters from ClusterBlast to display" help="Default: 10" />
            <param argument="--cb-min-homology-scale" type="float" min="0" max="1" value="0" label="ClusterBlast minimum scaling factor"
                help="A minimum scaling factor for the query BGC in ClusterBlast results. Default: 0" />
            <param argument="--rre-cutoff" type="float" min="0" max="100" value="25" label="RRE cutoff" help="Bitscore cutoff for RRE pHMM detection. Default: 25.0" />
            <param argument="--rre-minlength" type="integer" min="0" max="100" value="50" label="RRE minlength" help="Minimum amino acid length of RRE domains. Default: 50" />
        </section>

        <!-- Selects which of the optional outputs are kept; the output filters test membership in this list. -->
        <param name="outputs" type="select" multiple="true" label="Outputs">
            <option value="html" selected="true">HTML file</option>
            <option value="all">All results</option>
            <option value="embl">EMBL files</option>
            <option value="gb">GenBank files</option>
            <option value="genecluster_tabular">Gene clusters</option>
            <option value="log">Log file</option>
        </param>

    </inputs>
    <outputs>
        <!-- Every collection is discovered in the input_tempfile result directory and is kept
             only when its key is selected in the "outputs" parameter. -->
        <collection type="list" name="genecluster_tabular" label="${tool.name} on ${on_string}: Gene Cluster">
            <discover_datasets pattern="(?P&lt;designation&gt;.*)\.txt" directory="input_tempfile" ext="txt" visible="false" />
            <filter>'genecluster_tabular' in outputs</filter>
        </collection>
        <!-- GenBank files are also kept whenever the full genome PFAM annotation is enabled. -->
        <collection name="genbank" type="list" label="${tool.name} on ${on_string}: GenBank">
            <discover_datasets pattern="(?P&lt;designation&gt;.*)\.gbk" directory="input_tempfile" ext="genbank" visible="false" />
            <filter>'gb' in outputs or fullhmmer</filter>
        </collection>
        <!-- The pattern used to be a copy of the GenBank one, so .gbk files were collected and
             typed as EMBL. NOTE(review): confirm the packaged antiSMASH still writes .embl files. -->
        <collection name="embl" type="list" label="${tool.name} on ${on_string}: EMBL">
            <discover_datasets pattern="(?P&lt;designation&gt;.*)\.embl" directory="input_tempfile" ext="embl" visible="false" />
            <filter>'embl' in outputs</filter>
        </collection>
        <collection name="archive" type="list" label="${tool.name} on ${on_string}: all files compressed">
            <discover_datasets pattern="(?P&lt;designation&gt;.*)\.zip" directory="input_tempfile" ext="zip" visible="false" />
            <filter>'all' in outputs</filter>
        </collection>
        <!-- The HTML report has no filter and is therefore always produced. -->
        <data format="html" name="html" label="${tool.name} on ${on_string}: HTML report" />
        <data format="txt" name="log" label="${tool.name} on ${on_string}: log file">
            <filter>'log' in outputs</filter>
        </data>
    </outputs>
    <tests>
        <!-- Defaults only: FASTA input, the HTML report is the single output. -->
        <test expect_num_outputs="1">
            <param name="infile" value="sequence.fasta"/>
            <output name="html" file="index.html"/>
        </test>
        <!-- GenBank input in the fungi branch with CASSIS, both HMMer analyses and general ClusterBlast. -->
        <test expect_num_outputs="2">
            <param name="infile" value="sequence.gb"/>
            <param name="outputs" value="html,gb"/>
            <!-- Parameters that live inside the taxon conditional are addressed through it. -->
            <conditional name="cond_taxon">
                <param name="taxon" value="fungi"/>
                <param name="cassis" value="true"/>
            </conditional>
            <param name="clusterhmmer" value="true"/>
            <param name="fullhmmer" value="true"/>
            <param name="cb_general" value="true"/>
            <output_collection name="genbank" type="list">
                <element name="input_tempfile" file="test_02.genbank" ftype="genbank" lines_diff="2"/>
            </output_collection>
            <output name="html" file="index.2.html" ftype="html">
                <assert_contents>
                    <has_text text="No results found on input"/>
                </assert_contents>
            </output>
        </test>
        <!-- FASTA input with a GFF3 annotation, non-default advanced options and the log output. -->
        <test expect_num_outputs="3">
            <param name="infile" value="sequence_long.fasta"/>
            <param name="genefinding_gff3" value="annotation.gff3"/>
            <param name="fullhmmer" value="true"/>
            <param name="cc_mibig" value="true"/>
            <param name="pfam2go" value="true"/>
            <param name="rre" value="true"/>
            <param name="outputs" value="html,gb,log"/>
            <section name="advanced_options">
                <param name="minlength" value="1000"/>
                <param name="hmmdetection_strictness" value="strict"/>
                <param name="cb_nclusters" value="10"/>
                <param name="cb_min_homology_scale" value="0.1"/>
                <param name="rre_cutoff" value="10"/>
                <param name="rre_minlength" value="50"/>
            </section>
            <output_collection name="genbank" type="list">
                <element name="input_tempfile" file="test_03.genbank" ftype="genbank" lines_diff="2"/>
            </output_collection>
            <output name="html" file="index.3.html" ftype="html">
                <assert_contents>
                    <has_text text="No results found on input"/>
                </assert_contents>
            </output>
            <output name="log">
                <assert_contents>
                    <has_text text="antiSMASH status: SUCCESS"/>
                    <has_text text="HMM detection using strictness: strict"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help>
<![CDATA[

**What it does**

AntiSMASH allows the rapid genome-wide identification, annotation and analysis of secondary metabolite biosynthesis gene clusters in bacterial and fungal genomes.
It integrates and cross-links with a large number of in silico secondary metabolite analysis tools that have been published earlier.

antiSMASH is powered by several open source tools: NCBI BLAST+, HMMer 3, Muscle 3, Glimmer 3, FastTree, TreeGraph 2, Indigo-depict, PySVG and JQuery SVG.

**Input**

The ideal input for antiSMASH is an annotated nucleotide file in Genbank format or EMBL format.
You can either upload a GenBank/EMBL file manually, or simply enter the GenBank/RefSeq accession number of your sequence for antiSMASH to upload it.
If no annotation is available, we recommend running your sequence through an annotation pipeline like RAST to obtain GBK/EMBL files with high-quality annotations.

Alternatively, you can provide a FASTA file containing a single sequence. antiSMASH will generate a preliminary annotation using Prodigal, and use that to run the rest of the analysis.
You can also provide gene annotations in GFF3 format. Input files should be properly formatted.
If you are creating your GBK/EMBL/FASTA file manually, be sure to do so in a plain text editor like Notepad or Emacs, and save your files as "All files (.)", ending with the correct extension (for example ".fasta", ".gbk", or ".embl").

There are several optional analyses that may or may not be run on your sequence. Highly recommended is the Gene Cluster Blast Comparative Analysis, which runs BlastP using each amino acid sequence from a detected gene cluster as a query on a large database of predicted protein sequences from secondary metabolite biosynthetic gene clusters, and pools the results to identify the gene clusters that are most homologous to the gene cluster that was detected in your query nucleotide sequence.
This analysis is not selected by default in this tool; enable it with the "BLAST identified clusters against known clusters" option.

Also available is the analysis of secondary metabolism gene families (smCOGs). This analysis attempts to allocate each gene in the detected gene clusters to a secondary metabolism-specific gene family using profile hidden Markov models specific for the conserved sequence region characteristic of this family.
Additionally, a phylogenetic tree is constructed of each gene together with the (max. 100) sequences of the smCOG seed alignment. This analysis is selected by default.

**Output**

The output of the antiSMASH analysis pipeline is organized in an interactive HTML page with SVG graphics, and different parts of the analysis are displayed in different panels for every gene cluster.

In the upper right, a small list of buttons offers further functionality. The house-shaped button will get you back on the antiSMASH start page.
The question-mark button will get you to this help page. The exclamation-mark button leads to a page explaining about antiSMASH.
The downward-pointing arrow will open a menu offering to download the complete set of results from the antiSMASH run, a summary Excel file and to the summary EMBL/GenBank output file.
The EMBL/GenBank file can be viewed in a genome browser such as Artemis.

]]>
    </help>
    <expand macro="citations" />
</tool>