view bigscape.xml @ 0:a9e5d237d7d4 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/bigscape/ commit 1c7a35c3aabb33682b263cb3a8dbeaf605469c23
author iuc
date Sun, 25 Feb 2024 10:51:27 +0000
parents
children 353b2de0eabf
line wrap: on
line source

<tool id="bigscape" name="BiG-SCAPE" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="21.05">
    <description>Construct sequence similarity networks of BGCs and group them into GCF</description>
    <macros>
        <!-- Version of the wrapped bigscape package; reused for the tool version and the requirement below -->
        <token name="@TOOL_VERSION@">1.1.9</token>
        <!-- Wrapper revision, appended to the tool version as "+galaxy<suffix>" -->
        <token name="@VERSION_SUFFIX@">0</token>
    </macros>
    <requirements>
        <requirement type="package" version="@TOOL_VERSION@">bigscape</requirement>
    </requirements>
    <command detect_errors="exit_code">
    <![CDATA[
        ## Stage the GenBank inputs. The "region." prefix is added to every file name
        ## so that BiG-SCAPE includes the file (see the note in the help section).
        #set $path_to_html = $html.files_path
        mkdir -p '$path_to_html' result input &&
        #for $files in $inputdir:
            #set $filename = "region." + $files.element_identifier
            ln -s '$files' './input/$filename' &&
        #end for

        ## Link the Pfam database under the fixed name that hmmpress is called with,
        ## independent of how the dataset is named in the history.
        mkdir pfam &&
        ln -s '$pfam_dir' './pfam/Pfam-A.hmm' &&
        hmmpress './pfam/Pfam-A.hmm' &&

        #if $list.is_select == "yes":
            ## NOTE(review): this temporarily replaces domain_includelist.txt inside the
            ## tool directory and restores it only when all following steps succeed.
            ## Concurrent jobs or a failing run can leave the replaced file behind;
            ## confirm whether the tool directory is the right place for this file.
            cat '$__tool_directory__/domain_includelist.txt' > save.txt &&
            cat '$list.domain_includelist' > '$__tool_directory__/domain_includelist.txt' &&
        #end if

        bigscape
        --inputdir input
        #if $mibig.is_select == "yes":
            $mibig.mibig.value
        #end if
        --outputdir result
        #if $use_label.is_select == "yes":
            --label '$use_label.label'
        #end if
        --pfam_dir pfam
        --cores \${GALAXY_SLOTS:-8}
        ${verbose}
        ## The true value of this boolean is a shell redirection of stdout to log.txt
        ${log}
        ${include_singletons}
        --domain_overlap_cutoff ${domain_overlap_cutoff}
        --min_bgc_size ${min_big_size}
        ${mix}
        ${no_classify}
        #if $banned_classes.value:
            --banned_classes
            #for $banned in str($banned_classes).split( "," ):
                '$banned'
            #end for
        #end if
        ## Only emit the flag when at least one cutoff block exists; a bare flag
        ## without values would be rejected by the argument parser.
        #if len($cutoff) > 0:
            --cutoffs
            #for $c in $cutoff:
                ${c.cutoffs}
            #end for
        #end if
        ${clans_off}
        #if $clan_cutoff.is_select == "yes":
            --clan_cutoff $clan_cutoff.clan_cutoff_val1 $clan_cutoff.clan_cutoff_val2
        #end if
        ${hybrids_off}
        --mode ${mode.value}
        #if $anchor.is_select == "yes":
            --anchorfile '$anchor.anchorfile'
        #end if
        ${force_hmmscan}
        #if $list.is_select == "yes":
            --domain_includelist
        #end if
        &&

        ## Publish the HTML report and its assets as the html output
        cp './result/index.html' '$html' &&
        cp -r './result/html_content' '$path_to_html'

        #if $list.is_select == "yes":
            && cat save.txt > '$__tool_directory__/domain_includelist.txt'
        #end if

        #if $log:
            && cp log.txt '$logfile'
        #end if
    ]]>
    </command>
    <inputs>
        <!-- GenBank files to cluster; the command links each one into ./input with a "region." prefix -->
        <param argument="--inputdir" format="genbank" multiple="true" type="data"
            label="Data files to include in the clustering"
            help="Add your .gbk files here. Note: the prefix 'region.' is added to every file name to ensure that every file will be included!" />
        <conditional name="mibig">
            <param name="is_select" type="select" label="Include BGCs from MIBiG database?"
                help="Select yes and select which version of the database you want to use" >
                    <option value="yes" selected="False">Yes</option>
                    <option value="no" selected="True">No</option>
            </param>
            <when value="yes">
                <!-- The option value is the literal command line flag passed to bigscape -->
                <param name="mibig" type="select" optional="false" label="Version from the MIBiG database"
                    help="Select which version of the MIBiG database you want to use." >
                        <option value="--mibig">3.1</option>
                        <option value="--mibig21">2.1</option>
                        <option value="--mibig14">1.4</option>
                        <option value="--mibig13">1.3</option>
                </param>
            </when>
            <when value="no" />
        </conditional>
        <conditional name="use_label" >
            <param name="is_select" type="select" label="Adding extra string to BiG-SCAPE runs?"
                help="Select yes if you want to add an extra string to the outputs">
                <option value="yes" selected="False">Yes</option>
                <option value="no" selected="True">No</option>
            </param>
            <when value="yes">
                <param argument="--label" type="text" value="" optional="false" label="Adding extra string to BiG-SCAPE runs name"
                    help="By default the BiG-SCAPE runs are named as (YYYY-MM-DD_HH-MM-SS_[extra]) where extra means the mode and if activated (hybrids).">
                    <validator type="empty_field" />
                </param>
            </when>
            <when value="no" />
        </conditional>
        <!-- The dataset is passed to hmmpress as is, so it has to be the uncompressed HMM file -->
        <param argument="--pfam_dir" format="hmm3" type="data"
            label="Data file Pfam-A.hmm"
            help="Add the uncompressed Pfam-A.hmm file here (not the .gz archive), please. Look at the help section where you can download it!" />
        <param argument="--verbose" type="boolean" truevalue="--verbose" falsevalue="" checked="False"
            label="Print more detailed information about each step"
            help="Getting more information about the analysis if toggled is true." />
        <!-- The true value is a shell redirection that the command template appends to the bigscape call -->
        <param name="log" type="boolean" truevalue="> log.txt" falsevalue="" checked="False"
            label="Extra log file as output"
            help="When using you will receive a log file for the printed output of the tool. Can be useful when using the option above!" />
        <param argument="--include_singletons" type="boolean" truevalue="--include_singletons" falsevalue="" checked="False"
            label="Include BGCs with lower cutoff distance"
            help="With this option you can include BGCs that don't have a distance lower than the cutoff distance specified" />
        <param argument="--domain_overlap_cutoff" type="float" value="0.1" min="0.0" max="1.0"
            label="Specify when domains are considered to overlap"
            help="When using this option you can specify at which percentage domains are considered to overlap. The domain with the best score is kept. The default value is 0.1" />
        <!-- The parameter name is kept for compatibility; the flag passed to bigscape is min_bgc_size -->
        <param name="min_big_size" argument="--min_bgc_size" type="integer" value="0" min="0"
            label="Minimum size of a BGC (bp)"
            help="Minimum size of a BGC to be included in the analysis. The Default value is 0 base pairs. This also includes the sum of all loci in a multi-record GenBank file." />
        <param argument="--mix" type="boolean" truevalue="--mix" falsevalue="" checked="False"
            label="mix all classes in the analysis"
            help="BiG-SCAPE separates the analysis according to the BGC product by default. If used BiG-SCAPE will mix all classes and analyse them." />
        <param argument="--no_classify" type="boolean" truevalue="--no_classify" falsevalue="" checked="False"
            label="No classified output based on the BGC product"
            help="By default, BiG-SCAPE classifies the output based on the BGC product. If toggled it will deactivate it. Note: when (--mix) is not activated, BiG-SCAPE will not create any network file!" />
        <param argument="--banned_classes" type="select" optional="true" multiple="true" display="checkboxes"
            label="Excluded classes from classification in BiG-SCAPE"
            help="You can exclude any of these classes to not be classified. Multiple banned classes are possible." >
            <option value="PKSI" selected="False">PKSI</option>
            <option value="PKSother" selected="False">PKSother</option>
            <option value="NRPS" selected="False">NRPS</option>
            <option value="RIPPs" selected="False">RIPPs</option>
            <option value="Saccharides" selected="False">Saccharides</option>
            <option value="Terpene" selected="False">Terpene</option>
            <option value="PKS-NRP_Hybrids" selected="False">PKS-NRP_Hybrids</option>
            <option value="Others" selected="False">Others</option>
        </param>
        <!-- At least one cutoff is required: the values are passed together to a single cutoffs flag -->
        <repeat name="cutoff" title="Cutoffs" min="1" default="1"
             help="Generate networks using multiple raw distance cutoff values. The default value here is 0.3.">
            <param argument="--cutoffs" type="float" value="0.3" min="0.1" max="1.0"
                label="GCF cutoff value"/>
        </repeat>
        <param argument="--clans-off" type="boolean" truevalue="--clans-off" falsevalue="" checked="False"
            label="Turn off cluster GCFs into GCCs"
            help="By default, BiG-SCAPE will perform a second layer of clustering to group GCFs into GCCs. Toggle to deactivate this." />
        <conditional name="clan_cutoff" >
            <param name="is_select" type="select" label="Change cutoff values for cluster GCF into GCC?"
                help="Select yes if you want to change cutoffs for cluster families into clans.">
                <option value="yes" selected="False">Yes</option>
                <option value="no" selected="True">No</option>
            </param>
            <when value="yes">
                <param name="clan_cutoff_val1" type="float" value="0.3" min="0.1" max="1.0" label="GCF cutoff value"
                    help="This value is for finding GCFs which will be used for clan calling. The default value is 0.3." />
                <param name="clan_cutoff_val2" type="float" value="0.7" min="0.1" max="1.0" label="GCC cutoff value"
                    help="This value is for clustering families into clans. The default value is 0.7. Every pair of GCFs linked with a distance of this value or less will be taken into account!" />
            </when>
            <when value="no" />
        </conditional>
        <param argument="--hybrids-off" type="boolean" truevalue="--hybrids-off" falsevalue="" checked="False"
            label="Exclude hybrid predicted products"
            help="By default, BGCs with hybrid predicted products from the PKS/NRPS Hybrids and Others classes will be included in each subclass. Since the same cluster can appear in different classes you can turn this off here." />
        <param argument="--mode" type="select"
            label="Alignment Mode"
            help="Here you can choose between three alignment modes used when comparing each pair of gene clusters. For more information look into the help section!" >
            <option value="glocal">glocal</option>
            <option value="global">global</option>
            <option value="auto">auto</option>
        </param>
        <conditional name="anchor">
            <param name="is_select" type="select" label="Change Anchorfile?"
                help="Select yes if you want to use an Anchorfile. BiG-SCAPE has a default file which is always used and only these domains are counted in the result. Look in the help section to see what are the default domains!" >
                <option value="yes" selected="False">Yes</option>
                <option value="no" selected="True">No</option>
            </param>
            <when value="yes">
                <param argument="--anchorfile" type="data" format="txt" optional="false"
                    label="Using a different Anchorfile instead of default file"
                    help="Use a custom Anchorfile (in .txt format) to give certain domains a special weight in the DSS index. This file is important because otherwise certain domains (given from Pfam IDs) will not be listed in the results!" />
            </when>
            <when value="no" />
        </conditional>
        <param argument="--force_hmmscan" type="boolean" truevalue="--force_hmmscan" falsevalue="" checked="False"
            label="Use hmmscan for the domain prediction"
            help="Even if BiG-SCAPE finds processed domtables files we can force the domain prediction with hmmscan. Toggle to force hmmscan!" />
        <conditional name="list">
            <param name="is_select" type="select" label="Use a domain list to include domains?"
                help="Select yes if you want to use a list to filter the input to certain domains. See the help section for the expected format of this list.">
                <option value="yes" selected="False">Yes</option>
                <option value="no" selected="True">No</option>
            </param>
            <when value="yes">
                <param argument="--domain_includelist" type="data" format="txt" optional="false"
                    label="Use a .txt file to include domains"
                    help="Upload a text file where the inputs are filtered based on the Pfam IDs/domains given in this file!" />
            </when>
            <when value="no" />
        </conditional>
    </inputs>
    <outputs>
        <!-- HTML report: index.html of the result directory, assets go to the dataset's files path -->
        <data format="html" name="html" label="${tool.name}: HTML"/>

        <!-- Everything below is discovered recursively under result/network_files -->

        <!-- Network annotation tables, one list element per annotation file -->
        <collection type="list" name="tsv_collection_1" format="tabular"
            label="${tool.name}: NETWORK ANNOTATIONS COLLECTION">
            <discover_datasets
                directory="result/network_files"
                recurse="true"
                pattern="Network_Annotations_(?P&lt;designation&gt;.+)\.tsv"/>
        </collection>

        <!-- Clan tables; skipped when the clans_off switch is enabled -->
        <collection type="list" name="tsv_collection_2" format="tabular"
            label="${tool.name}: CLAN TABULAR FILES COLLECTION">
            <discover_datasets
                directory="result/network_files"
                recurse="true"
                pattern="(?P&lt;identifier_0&gt;[^_]+)_clans_(?P&lt;designation&gt;.+)\.tsv"/>
            <filter> clans_off == False </filter>
        </collection>

        <!-- Clustering tables, nested by the file name part before and after "_clustering_" -->
        <collection type="list:list" name="tsv_collection_3" format="tabular"
            label="${tool.name}: CLUSTERING TABULAR FILES COLLECTION">
            <discover_datasets
                directory="result/network_files"
                recurse="true"
                pattern="(?P&lt;identifier_0&gt;[^_]+)_clustering_(?P&lt;identifier_1&gt;.+)\.tsv"/>
        </collection>

        <!-- Network files; the misspelled collection name is kept because tests reference it -->
        <collection type="list" name="newtwork_collection" format="network"
            label="${tool.name}: NETWORK FILES COLLECTION">
            <discover_datasets
                directory="result/network_files"
                recurse="true"
                pattern="(?P&lt;designation&gt;.+)\.network"/>
        </collection>

        <!-- Captured bigscape stdout; only created when the log switch is enabled -->
        <data format="txt" name="logfile" label="${tool.name}: LOG FILE">
            <filter>log == True</filter>
        </data>
    </outputs>
    <tests>
        <!-- Test 1: defaults only (no log file, clans enabled): html report plus four collections -->
        <test expect_num_outputs="5">
            <param name="pfam_dir" value="Pfam-A.hmm" ftype="hmm3"/>
            <param name="inputdir" value="NC_010530.1.region005.gbk,NC_012963.1.region001.gbk,NW_009799099.1.region003.gbk,NW_021940918.1.region003.gbk,NW_009799102.1.region001.gbk,NW_022985561.1.region002.gbk,NW_022985549.1.region005.gbk,NW_022985575.1.region001.gbk" ftype="genbank" />
            <output name="html">
                <assert_contents>
                    <has_line line="&lt;!DOCTYPE html&gt;" />
                </assert_contents>
            </output>
        </test>
        <!-- Test 2: log file enabled and a cutoff of 1.0; checks the content of every output -->
        <test expect_num_outputs="6">
            <param name="pfam_dir" value="Pfam-A.hmm" ftype="hmm3"/>
            <param name="inputdir" value="NC_010530.1.region005.gbk,NC_012963.1.region001.gbk,NW_009799099.1.region003.gbk,NW_021940918.1.region003.gbk,NW_009799102.1.region001.gbk,NW_022985561.1.region002.gbk,NW_022985549.1.region005.gbk,NW_022985575.1.region001.gbk" ftype="genbank" />
            <param name="log" value="true" />
            <repeat name="cutoff">
                <param name="cutoffs" value="1.0"/>
            </repeat>
            <output name="html" >
                <assert_contents>
                    <has_line line="&lt;!DOCTYPE html&gt;" />
                </assert_contents>
            </output>
            <output name="logfile">
                <assert_contents>
                        <has_line line="   - - Processing input files - -" />
                </assert_contents>
            </output>
            <output_collection name="tsv_collection_1" type="list">
                <element name="Full" >
                    <assert_contents>
                        <has_text text="Accession ID" n="1"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="tsv_collection_2" type="list">
                <element name="NRPS">
                    <assert_contents>
                        <has_text text="Clan Number" n="1"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="tsv_collection_3" type="list:list">
                <element name="NRPS">
                    <element name="c0.30">
                        <assert_contents>
                            <has_text text="Family Number" n="1" />
                        </assert_contents>
                    </element>
                    <element name="c1.00">
                        <assert_contents>
                            <has_text text="Family Number" n="1" />
                        </assert_contents>
                    </element>
                </element>
            </output_collection>
            <output_collection name="newtwork_collection" type="list">
                <element name="NRPS_c0.30">
                    <assert_contents>
                        <has_text text="Raw distance" n="1"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- Test 3: clans switched off removes the clan collection, the log file is added: still five outputs -->
        <test expect_num_outputs="5">
            <param name="pfam_dir" value="Pfam-A.hmm" ftype="hmm3"/>
            <param name="inputdir" value="NC_010530.1.region005.gbk,NC_012963.1.region001.gbk,NW_009799099.1.region003.gbk,NW_021940918.1.region003.gbk,NW_009799102.1.region001.gbk,NW_022985561.1.region002.gbk,NW_022985549.1.region005.gbk,NW_022985575.1.region001.gbk" ftype="genbank" />
            <param name="log" value="true" />
            <param name="clans_off" value="true" />
            <output name="html">
                <assert_contents>
                    <has_line line="&lt;!DOCTYPE html&gt;" />
                </assert_contents>
            </output>
            <output name="logfile">
                <assert_contents>
                    <has_line line="   - - Processing input files - -" />
                </assert_contents>
            </output>
        </test>
    </tests>
    <help> 
    <![CDATA[
        .. class:: infomark

        **What is BiG-SCAPE**

        BiG-SCAPE (Biosynthetic Gene Similarity Clustering and Prospecting Engine) is a software package, written in Python, that constructs sequence similarity networks of Biosynthetic Gene Clusters (BGCs) and groups them into Gene Cluster Families (GCFs).

        .. class:: infomark

        **What it does**

        BiG-SCAPE does this by rapidly calculating a distance matrix between gene clusters based on a comparison of their protein domain content, order, copy number and sequence identity.

        In principle, BiG-SCAPE can also be used on any other gene clusters, such as pathogenicity islands, secretion system-encoding gene clusters, or even whole viral genomes.

        Here is a graphic showing how BiG-SCAPE works:

        .. image:: bigscape_corason.png
            :alt: BiG-SCAPE + CORASON workflow

        For more information you can visit `BiG-SCAPE on GitHub <https://github.com/medema-group/BiG-SCAPE>`_ or go to the `combined BiG-SCAPE/CORASON website <https://bigscape-corason.secondarymetabolites.org/index.html>`_.

        **Input**

        BiG-SCAPE uses two kinds of input:

        - The genbank files from antiSMASH

        .. class:: infomark

        Note: By default, BiG-SCAPE includes any GenBank file whose filename contains either region or cluster. To ensure that every file is included, we add 'region.' to the file name!

        - The Pfam-A.hmm file

        .. class:: infomark

        Note: You can download `Pfam-A.hmm.gz <https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.gz>`_ here and then unzip it or you can use the command: *$ wget https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.gz && gunzip Pfam-A.hmm.gz* in e.g. VSC.

        There are two additional inputs which can be used:

        - An anchor_domains.txt file

        .. class:: infomark

        Example (default file which will be used):

        ::
        
         PF00668	Condensation domain [NRPS]
         PF00501	AMP-binding enzyme [NRPS]
         PF00109	Beta-ketoacyl synthase N-terminal [PKS]
         PF02801	Beta-ketoacyl synthase C-terminal [PKS]
         PF01397	Terpene synthase, N-terminal domain (Terpene_synth) [Terpene]
         PF03936	Terpene synthase family, metal binding domain (Terpene_synth_C) [Terpene]
         PF00195	Chalcone and stilbene synthases, N-terminal domain (Chal_sti_synt_N)
         PF02797	Chalcone and stilbene synthases, C-terminal domain (Chal_sti_synt_C)
         PF05147	Lanthionine synthetase C-like protein (LANC_like) [lantipeptide/RiPP]
         PF00494	Squalene/phytoene synthase (SQS_PSY) [Terpene]
         PF00432	Prenyltransferase and squalene oxidase repeat (Prenyltrans)
         PF02624	YcaO cyclodehydratase, ATP-ad MG2+-binding (YcaO) [RiPP]
        
        The first column contains the Pfam model ID, while the second column is optional and can be used for a comment. The columns are tab-separated!

        - A domain_includelist.txt

        .. class:: infomark

        Example:

        ::

         PF00067    Cytochrome P450
         PF01451    Any Comment

        The first column contains the Pfam model ID, while the second column is optional and can be used for a comment. The columns are tab-separated and any line that starts with a # will be ignored!


        **Output**

        BiG-SCAPE will produce one HTML Output together with a dataset with different tabular files depending on the input. When the log file option is set it will create another output, where all prints made from this tool are stored.


        **Additional information on the alignment mode**

        - glocal: This is the default mode. Here the subset of the domains used to calculate distance is redefined by finding the longest slice of common domain content per gene in both BGCs, and then expanding each slice.

        - global: The whole list of domains of each BGC are compared.

        - auto: Use glocal mode when at least one of the BGCs in each pair has the contig_edge annotation from antiSMASH. Otherwise global will be used.
    ]]>
    </help>
    <citations>
        <!-- DOI of the publication to cite when using this tool -->
        <citation type="doi">10.1038/s41589-019-0400-9</citation>
    </citations>
</tool>