view cami_amber.xml @ 5:e30bc6da7a36 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/amber/ commit 04a067550f7d61ca8ff489e3de64efd0da3abcf1
author iuc
date Sun, 08 Sep 2024 14:37:28 +0000
parents 6c71acde9d52
children
line wrap: on
line source

<tool id="cami_amber" name="CAMI AMBER" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>Evaluation package for MAGs</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <command detect_errors="exit_code">
    <![CDATA[

        ## Extra-files directory of the HTML output; the heatmap image is moved there at the end.
        #set $path_to_html = $html.files_path
        ## Values gathered from the repeat blocks; joined into comma-separated arguments below.
        #set $label_list = []
        #set $min_list = []
        #set $max_list = []

        mkdir -p output inputs '$path_to_html' &&

        ## Manually supplied NCBI .dmp files are linked into ./ncbi under their element identifiers.
        ## Parameters nested in conditionals are addressed by their full path
        ## (section.conditional.conditional.param).
        #if $tox.ncbi.is_select == 'yes':
            #if $tox.ncbi.input.is_select == 'manually':
                mkdir -p ncbi &&
                #for $dmp in $tox.ncbi.input.ncbi_dir:
                    ln -s '$dmp' './ncbi/$dmp.element_identifier' &&
                #end for
            #end if
        #end if

        ## Link every binning file to inputs/<index>.tsv and remember its optional label.
        #for $i, $entry in enumerate($input_files):
            ln -s '$entry.binning_files' './inputs/${i}.tsv' &&
            #if $entry.labels:
                #silent $label_list.append(str($entry.labels))
            #end if
        #end for

        ## Compare the string form so that an explicit threshold of 0 is kept
        ## (a plain truthiness test would drop it).
        #for $threshold in $genome.thresholds:
            #if str($threshold.min_completeness) not in ('', 'None'):
                #silent $min_list.append(str($threshold.min_completeness))
            #end if
            #if str($threshold.max_contamination) not in ('', 'None'):
                #silent $max_list.append(str($threshold.max_contamination))
            #end if
        #end for

        amber.py
        -g '${gold_standard_file}'
        #if $label_list:
            #set $label_arg = ','.join($label_list)
            -l '$label_arg'
        #end if
        -p ${filter}
        #if $min_length:
            -n $min_length
        #end if
        #if $desc:
            -d '${desc}'
        #end if
        #if $min_list:
            #set $min_arg = ','.join($min_list)
            --min_completeness '$min_arg'
        #end if
        #if $max_list:
            #set $max_arg = ','.join($max_list)
            --max_contamination '$max_arg'
        #end if
        #if $genome.remove_genomes:
            -r '$genome.remove_genomes'
        #end if
        #if $genome.remove.is_select == 'yes':
            -k '$genome.remove.keyword'
        #end if
        #if $genome.genome_coverage:
            --genome_coverage '$genome.genome_coverage'
        #end if
        #if $tox.ncbi.is_select == 'yes':
            #if $tox.ncbi.input.is_select == 'manually':
                --ncbi_dir ncbi
            #else:
                --ncbi_dir '$tox.ncbi.input.ncbi_dir.fields.path'
            #end if
        #end if
        -o output
        ## Positional arguments: the linked binning files, in the order they were given.
        #for $i, $entry in enumerate($input_files):
            'inputs/${i}.tsv'
        #end for

        &&

        mv 'output/heatmap_bar.png' '$path_to_html'

    ]]>
    </command>
    <inputs>
        <!-- Gold standard mapping; handed to amber.py with -g -->
        <param argument="--gold_standard_file" type="data" format="tabular"
               label="Mapping of contigs or reads"
               help="Input the gold standard file here so amber know the correct IDs for each contig/read"/>

        <!-- One entry per binning result; the optional labels are collected for -l -->
        <repeat name="input_files" title="Binning files and names "
                help="Enter multiple binning files and names (names are optional). IMPORTANT: for each binning file you use in the program you need to state one label, this mean for example for 3 binning files you need 3 labels (3 slots) not more or less!">
            <param name="binning_files" type="data" format="tabular"
                   label="Input bin files here"/>
            <param argument="--labels" type="text" optional="true" value=""
                   label="Name for bin"/>
        </repeat>

        <!-- General options (-p, -n and -d on the command line) -->
        <param argument="--filter" type="integer" min="0" value="0"
               label="Filter out the n smallest genome bins"
               help="Optional filter for filter out the n smallest genome bins"/>
        <param argument="--min_length" type="integer" optional="true" value=""
               label="Minimum length of sequences"
               help="Input how long the sequences has to be"/>
        <param argument="--desc" type="text" value=""
               label="HTML description"
               help="Enter the HTML page description here"/>

        <section name="genome" title="Genome binning-specific options">
            <!-- Each repeat entry may contribute one value to either threshold list -->
            <repeat name="thresholds" title="Min. completeness and max. contamination thresholds"
                    help="Enter certain thresholds for min. completeness (Default %: 50,70,90) and certain thresholds for max. contamination (Default %: 5, 10), the program itself will transform it to %!">
                <param argument="--min_completeness" type="integer" optional="true" min="0" max="100" value=""
                       label="Min. completeness threshold"/>
                <param argument="--max_contamination" type="integer" optional="true" min="0" max="100" value=""
                       label="Max. contamination threshold"/>
            </repeat>
            <param argument="--remove_genomes" type="data" format="tabular" optional="true"
                   label="tsv file for genomes to remove"
                   help="Input a tsv file with binid and type in each line. In the help section is an example. WARNING: IF THE LIST CONTAIN ALL GENOME THE PROGRAM WILL FAIL!"/>
            <!-- The keyword field is only offered when removal is switched on -->
            <conditional name="remove">
                <param name="is_select" type="select"
                       label="Remove one or all genomes which are in the list?"
                       help="Select yes and enter a keyword to remove certain type of genomes which are in the list. When all genomes in the list should remove also select yes and do not enter a keyword!">
                    <option value="yes" selected="false">Yes</option>
                    <option value="no" selected="true">No</option>
                </param>
                <when value="yes">
                    <param argument="--keyword" type="text" value=""
                           label="Keyword for removing certain genomes"
                           help="Input a keyword which should be match with binid giving in the file for removing genomes. When no keyword is giving the program remove all genomes which are in the list!"/>
                </when>
                <when value="no"/>
            </conditional>
            <param argument="--genome_coverage" type="data" format="tabular" optional="true"
                   label="Genome coverages tsv file"
                   help="Input a tsv file where the genome coverage is stated. Look at the help section to see how this file should look like!"/>
        </section>

        <section name="tox" title="taxonomic binning-specific option">
            <!-- Outer switch: use the NCBI taxonomy at all? Inner switch: where it comes from -->
            <conditional name="ncbi">
                <param name="is_select" type="select"
                       label="Want to use the NCBI database?"
                       help="Select yes if you want to use the NCBI database to evaluate taxonomic binning">
                    <option value="yes" selected="false">Yes</option>
                    <option value="no" selected="true">No</option>
                </param>
                <when value="yes">
                    <conditional name="input">
                        <param name="is_select" type="select"
                               label="Select how you want to use NCBI database"
                               help="Either select manually input or data manager. For more help read the help of the type of the input you selected.">
                            <option value="manually">Manually</option>
                            <option value="data">Data manager</option>
                        </param>
                        <when value="manually">
                            <param argument="--ncbi_dir" type="data" format="tabular" multiple="true"
                                   label="Input .dmp files"
                                   help="To use the NCBI database we need to provide followed .dmp files: nodes.dmp, names.dmp and merged.dmp. You can get the via download of this file **ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz** and unzip it!"/>
                        </when>
                        <when value="data">
                            <param argument="--ncbi_dir" type="select"
                                   label="Assessing taxonomic binning"
                                   help="Include the NCBI taxonomy database. For this you can use the data manager data_manager_fetch_ncbi_taxonomy which can be install via galaxy. For more help look at the help section at the bottom!">
                                <options from_data_table="ncbi_taxonomy">
                                    <validator type="no_options" message="No NCBI database is available"/>
                                </options>
                            </param>
                        </when>
                    </conditional>
                </when>
                <when value="no"/>
            </conditional>
        </section>
    </inputs>
    <outputs>
        <!-- All four datasets are collected from the directory given to amber.py with -o -->
        <data name="html"
              format="html"
              from_work_dir="output/index.html"
              label="${tool.name}: HTML"/>
        <data name="result"
              format="tabular"
              from_work_dir="output/results.tsv"
              label="${tool.name}: Results"/>
        <data name="metrics_genome"
              format="tabular"
              from_work_dir="output/genome_metrics_cami1.tsv"
              label="${tool.name}: Genome metrics"/>
        <data name="metrics_bin"
              format="tabular"
              from_work_dir="output/bin_metrics.tsv"
              label="${tool.name}: Bin metrics"/>
    </outputs>
    <tests>
        <!-- Test 1: three unlabelled binning files, every other option left at its default -->
        <test expect_num_outputs="4">
            <param name="gold_standard_file" ftype="tabular" value="gsa_mapping.binning"/>
            <repeat name="input_files">
                <param name="binning_files" ftype="tabular" value="elated_franklin_0"/>
            </repeat>
            <repeat name="input_files">
                <param name="binning_files" ftype="tabular" value="goofy_hypatia_2"/>
            </repeat>
            <repeat name="input_files">
                <param name="binning_files" ftype="tabular" value="naughty_carson_2"/>
            </repeat>
            <output name="metrics_bin">
                <assert_contents>
                    <has_text text="genome_id" n="1"/>
                    <has_text text="sample_id" n="1"/>
                </assert_contents>
            </output>
            <output name="metrics_genome">
                <assert_contents>
                    <has_text text="precision_seq" n="1"/>
                    <has_text text="total_length" n="1"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test 2: labelled inputs, thresholds, genome removal by keyword, coverage file
             and manually supplied NCBI .dmp files -->
        <test expect_num_outputs="4">
            <param name="gold_standard_file" ftype="tabular" value="gsa_mapping.binning"/>
            <repeat name="input_files">
                <param name="binning_files" ftype="tabular" value="elated_franklin_0"/>
                <param name="labels" value="test1"/>
            </repeat>
            <repeat name="input_files">
                <param name="binning_files" ftype="tabular" value="goofy_hypatia_2"/>
                <param name="labels" value="test2"/>
            </repeat>
            <repeat name="input_files">
                <param name="binning_files" ftype="tabular" value="naughty_carson_2"/>
                <param name="labels" value="test3"/>
            </repeat>
            <param name="filter" value="1"/>
            <param name="min_length" value="200"/>
            <section name="genome">
                <repeat name="thresholds">
                    <param name="max_contamination" value="2"/>
                    <param name="min_completeness" value="50"/>
                </repeat>
                <repeat name="thresholds">
                    <param name="min_completeness" value="70"/>
                </repeat>
                <repeat name="thresholds">
                    <param name="min_completeness" value="90"/>
                </repeat>
                <param name="genome_coverage" ftype="tabular" value="cami2_mouse_gut_average_genome_coverage.tsv"/>
                <param name="remove_genomes" ftype="tabular" value="unique_common.tsv"/>
                <conditional name="remove">
                    <param name="is_select" value="yes"/>
                    <param name="keyword" value="circular element"/>
                </conditional>
            </section>
            <section name="tox">
                <conditional name="ncbi">
                    <param name="is_select" value="yes"/>
                    <conditional name="input">
                        <param name="is_select" value="manually"/>
                        <param name="ncbi_dir" ftype="tabular" value="test-db/nodes.dmp,test-db/merged.dmp,test-db/names.dmp"/>
                    </conditional>
                </conditional>
            </section>
            <output name="result">
                <assert_contents>
                    <has_text text="test1" n="1"/>
                    <has_text text="test2" n="1"/>
                    <has_text text="test3" n="1"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test 3: HTML description plus NCBI taxonomy selected from the data table -->
        <test expect_num_outputs="4">
            <param name="gold_standard_file" ftype="tabular" value="gsa_mapping.binning"/>
            <repeat name="input_files">
                <param name="binning_files" ftype="tabular" value="elated_franklin_0"/>
            </repeat>
            <repeat name="input_files">
                <param name="binning_files" ftype="tabular" value="goofy_hypatia_2"/>
            </repeat>
            <repeat name="input_files">
                <param name="binning_files" ftype="tabular" value="naughty_carson_2"/>
            </repeat>
            <param name="desc" value="TEST FOR GALAXY"/>
            <section name="tox">
                <conditional name="ncbi">
                    <param name="is_select" value="yes"/>
                    <conditional name="input">
                        <param name="is_select" value="data"/>
                        <param name="ncbi_dir" value="test-db-tox"/>
                    </conditional>
                </conditional>
            </section>
            <output name="html">
                <assert_contents>
                    <has_text text="TEST FOR GALAXY" n="1"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test 4: second gold standard / binning data set with NCBI taxonomy from the data table -->
        <test expect_num_outputs="4">
            <param name="gold_standard_file" ftype="tabular" value="test_gold.tsv"/>
            <repeat name="input_files">
                <param name="binning_files" ftype="tabular" value="test_binning.tsv"/>
            </repeat>
            <repeat name="input_files">
                <param name="binning_files" ftype="tabular" value="test_binning2.tsv"/>
            </repeat>
            <section name="tox">
                <conditional name="ncbi">
                    <param name="is_select" value="yes"/>
                    <conditional name="input">
                        <param name="is_select" value="data"/>
                        <param name="ncbi_dir" value="test-db-tox"/>
                    </conditional>
                </conditional>
            </section>
            <output name="metrics_bin">
                <assert_contents>
                    <has_text text="genome_id" n="1"/>
                    <has_text text="sample_id" n="1"/>
                </assert_contents>
            </output>
            <output name="metrics_genome">
                <assert_contents>
                    <has_text text="precision_seq" n="1"/>
                    <has_text text="total_length" n="1"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help>
    <![CDATA[
        .. class:: infomark

        **What is AMBER**

        AMBER is an evaluation package for the comparative assessment of genome reconstructions and taxonomic assignments from metagenome benchmark datasets.

        .. class:: infomark

        **What it does**

        AMBER calculates multiple metrics per bin and per sample. These are then provided as result rankings and comparative visualizations for assessing multiple programs or parameter effects.

        For more information please visit `AMBER in GitHub <https://github.com/CAMI-challenge/AMBER>`_.

        **Input**

        AMBER has only 2 required inputs:

        - The gold standard file (biobox format)

        This file can be created via the add_length tool

        .. class:: infomark

        Example(tab separated)

        ::
        
         @Version:0.9.1
         @SampleID:CAMI_low

         @@SEQUENCEID	BINID	_LENGTH
         RL|S1|C10817	Sample18_57	20518
         RL|S1|C11497	Sample22_57	37672
         RL|S1|C6571	evo_1286_AP.033	69914
         RL|S1|C10560	evo_1286_AP.033	995657
         RL|S1|C13546	evo_1286_AP.033	626775

        Note: This file looks similar to the binning files; the only difference, which is important, is the additional length column.

        - Multiple binning files (biobox format)

        Files can be created via the convert_to_biobox tool

        .. class:: infomark

        Example(tab separated):

        ::

         #CAMI Format for Binning
         @Version:0.9.0
         @SampleID:CAMI_low
         @@SEQUENCEID	BINID
         RL|S1|C10	Bin_034
         RL|S1|C100	Bin_023
         RL|S1|C1000	Bin_034
         RL|S1|C10000	Bin_019
         RL|S1|C10002	Bin_035
         RL|S1|C10004	Bin_035
         RL|S1|C10008	Bin_034
         RL|S1|C10011	Bin_035
         RL|S1|C10012	Bin_013
         RL|S1|C10014	Bin_035

        There are also additional inputs which can be used:

        - A genome list which should be removed(tabular format)

        .. class:: infomark

        Example(tab separated): 

        ::

         evo_1035930.029	common strain
         1035930	common strain
         evo_1035930.032	common strain
         evo_1035930.011	common strain
         evo_1286_AP.033	common strain
         1286_AP	common strain
         evo_1286_AP.026	common strain
         evo_1286_AP.037	common strain
         evo_1286_AP.008	common strain
         1052944	common strain
         1053058	common strain
         1052947	common strain
         evo_1049056.013	common strain
         evo_1049056.031	common strain
         evo_1049056.011	common strain
         1049056	common strain
         evo_1049056.039	common strain

        Note: The first column contains the BINID and the second contains an arbitrary string. IMPORTANT: The keyword argument has to match the text in the second column in order to filter out these genomes. If no keyword is stated and the remove list contains all genomes that should be used, AMBER will fail since there are no genomes left to use!

        - Genome coverage file (tabular format)

        .. class:: infomark

        Example(tab separated):

        ::

         @SampleID:gsa_pooled
         @@GENOMEID	COVERAGE
         4378740.0	82.85111527272727
         4378740.1	27.159305090909097
         denovo10559.0	2.1596957142857143
         179927.0	1.6946866666666667
         denovo8373.1	2.07144
         136604.0	9.150489565217391
         denovo8373.0	1.1413460000000002
         269378.0	8.051563333333332
         190114.0	18.253119629629627
         228140.0	3.078681818181818
         135956.0	121.52672015625001
         259846.0	12.298210588235296
         162576.0	9.57867191489362
         184966.0	14.461031521739134

        There is also an option to include the NCBI database in AMBER. This can be used when including the link which is stated in the data manager: **data_manager_fetch_ncbi_taxonomy**. This data manager downloads the current version of the database and stores all files for you. If there are questions about data managers, have a look at this `Tutorial <https://usegalaxy.eu/training-material/topics/admin/tutorials/reference-genomes/tutorial.html>`_.

        **Output**

        AMBER outputs 3 tsv files in which each metric value is stated. The most important output is the HTML file, which includes all data and also visualizes them with several plots!

        **Additional information**

        The package **Bokeh** will create warnings which can be ignored, since they only tell you that certain functions in the code have been replaced by newer functions. AMBER was tested with the stated version of **Bokeh** and with the newest version of it, and both generate the same output!
    ]]>
    </help>
    <expand macro="citations" />
</tool>