view qa.xml @ 2:5466420816b9 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/checkm commit 3ae2bd72f789518e95ef65b97e0e5ac90165e113
author iuc
date Mon, 02 Sep 2024 13:51:10 +0000
parents 5c0493cdced9
children
line wrap: on
line source

<tool id="checkm_qa" name="CheckM qa" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <!-- Short tagline for the tool. -->
    <description>
        Assess bins for contamination and completeness
    </description>
    <!-- macros.xml is imported so that the expand elements used throughout this file can be resolved.
         NOTE(review): the TOOL_VERSION, VERSION_SUFFIX and PROFILE tokens used on the tool element are
         presumably defined in macros.xml as well; confirm there. -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <!-- Shared blocks pulled in from the imported macros; their contents are not visible in this file. -->
    <expand macro="biotools"/>
    <expand macro="requirements"/>
    <expand macro="version"/>
    <!--
        Command template (Cheetah). Steps, in order:
        1. Build a directory layout under output/: the HMM info and bin stats inputs are copied into
           output/storage/ under fixed file names, and every element of the hmmer_analyze collection is
           copied to output/bins/IDENTIFIER/hmmer.analyze.txt.
        2. IDENTIFIER is the element identifier with every character other than whitespace, word
           characters, hyphen and dot replaced by an underscore.
        3. Only when out_format is 9, every element of genes_faa is copied to
           output/bins/IDENTIFIER/genes.faa.
        4. Run "checkm qa" on the marker file and the output/ directory, writing a tab-separated table
           to output_file. The exclude_markers and coverage inputs are passed only when they are set.
           The thread count is taken from GALAXY_SLOTS and falls back to 1.
        NOTE(review): this layout presumably mirrors the output directory written by CheckM analyze;
        confirm against the CheckM documentation.
    -->
    <command detect_errors="exit_code"><![CDATA[
#import re
mkdir -p 'output/storage/' &&
cp '$checkm_hmm_info' 'output/storage/checkm_hmm_info.pkl.gz' &&
cp '$bin_stats_analyze' 'output/storage/bin_stats.analyze.tsv' &&
#for $i in $hmmer_analyze
    #set $identifier = re.sub('[^\s\w\-\\.]', '_', str($i.element_identifier))
mkdir -p 'output/bins/${identifier}' &&
cp '$i' 'output/bins/${identifier}/hmmer.analyze.txt' &&
#end for
#if $output.out_format == '9'
    #for $i in $output.genes_faa
        #set $identifier = re.sub('[^\s\w\-\\.]', '_', str($i.element_identifier))
mkdir -p 'output/bins/${identifier}' &&
cp '$i' 'output/bins/${identifier}/genes.faa' &&
    #end for
#end if

checkm qa
    '$marker_file'
    'output'
    --out_format $output.out_format
    --tab_table
    --file 'output_file'
#if $exclude_markers
    --exclude_markers '$exclude_markers'
#end if
    $individual_markers  
    $skip_adj_correction
    $skip_pseudogene_correction
    --aai_strain $aai_strain
    $ignore_thresholds   
    --e_value $e_value
    --length $length
#if $coverage 
    --coverage_file '$coverage'
#end if
    --threads \${GALAXY_SLOTS:-1}
    ]]></command>
    <inputs>
        <expand macro="marker_file" />
        <!-- Files from the CheckM analyze tool; the command section copies them into the directory
             layout that is handed to "checkm qa". -->
        <param name="checkm_hmm_info" type="data" format="zip" label="Marker gene HMM info for each bin" help="Output of the CheckM analyze tool" />
        <param name="bin_stats_analyze" type="data" format="tabular" label="Marker gene bin stats" help="Output of the CheckM analyze tool" />
        <param name="hmmer_analyze" type="data_collection" collection_type="list" format="txt" label="Marker gene HMM hits to each bin" help="Output of the CheckM analyze tool" />
        <!-- Selects the report written to output_file; only format 9 needs an additional input. -->
        <conditional name="output">
            <param argument="--out_format" type="select" label="Desired output">
                <option value="1">Summary of bin completeness and contamination</option>
                <option value="2">Extended summary of bin statistics (includes GC, genome size, ...)</option>
                <option value="3">Summary of bin quality for increasingly basal lineage-specific marker sets</option>
                <option value="4">List of marker genes and their counts</option>
                <option value="5">List of bin id, marker gene id, gene id</option>
                <option value="6">List of marker genes present multiple times in a bin</option>
                <option value="7">List of marker genes present multiple times on the same scaffold</option>
                <option value="8">List indicating position of each marker gene within a bin</option>
                <option value="9">Marker genes identified in each bin and their sequence</option>
            </param>
            <when value="1"/>
            <when value="2"/>
            <when value="3"/>
            <when value="4"/>
            <when value="5"/>
            <when value="6"/>
            <when value="7"/>
            <when value="8"/>
            <when value="9">
                <!-- Copied to genes.faa in each bin directory by the command section. The files hold
                     amino acid sequences, so the label says protein and the format is restricted to fasta. -->
                <param name="genes_faa" type="data_collection" collection_type="list" format="fasta" label="Protein sequences of the called genes for each bin" help="Gene sequences (genes.faa) from the CheckM analyze tool. Element identifiers should match those of the HMM hits collection." />
            </when>
        </conditional>
        <!-- The argument attribute holds the real command-line flag including its leading dashes;
             the derived parameter name is still exclude_markers. -->
        <param argument="--exclude_markers" type="data" format="txt" optional="true" label="Markers to exclude from marker sets" />
        <expand macro="qa_params" />
        <!-- The flag passed to checkm is coverage_file, so the parameter name is set explicitly to keep
             the existing name "coverage" used by the command section and by saved workflows. -->
        <param name="coverage" argument="--coverage_file" type="data" format="txt" optional="true" label="Coverage of each sequence" help="Generated by the coverage command" />
        <!-- Optional additional datasets; the available choices come from the imported macros. -->
        <param name="extra_outputs" type="select" multiple="true" optional="true" label="Extra outputs">
            <expand macro="qa_extra_output_options" />
        </param>
    </inputs>
    <outputs>
        <!-- Main report. All nine datasets read the same work-dir file (output_file); each filter keeps
             only the one whose number equals the selected out_format, so a single report dataset is
             produced per run and it carries a label describing that format. -->
        <data name="output_f1" format="tabular" label="${tool.name} on ${on_string}: Summary of bin completeness and contamination" from_work_dir="output_file">
            <filter>output['out_format'] == '1'</filter>
        </data>
        <data name="output_f2" format="tabular" label="${tool.name} on ${on_string}: Extended summary of bin statistics" from_work_dir="output_file">
            <filter>output['out_format'] == '2'</filter>
        </data>
        <data name="output_f3" format="tabular" label="${tool.name} on ${on_string}: Summary of bin quality for increasingly basal lineage-specific marker sets" from_work_dir="output_file">
            <filter>output['out_format'] == '3'</filter>
        </data>
        <data name="output_f4" format="tabular" label="${tool.name} on ${on_string}: Marker genes and their counts" from_work_dir="output_file">
            <filter>output['out_format'] == '4'</filter>
        </data>
        <data name="output_f5" format="tabular" label="${tool.name} on ${on_string}: Bin id, marker gene id, gene id" from_work_dir="output_file">
            <filter>output['out_format'] == '5'</filter>
        </data>
        <data name="output_f6" format="tabular" label="${tool.name} on ${on_string}: Marker genes present multiple times in a bin" from_work_dir="output_file">
            <filter>output['out_format'] == '6'</filter>
        </data>
        <data name="output_f7" format="tabular" label="${tool.name} on ${on_string}: Marker genes present multiple times on the same scaffold" from_work_dir="output_file">
            <filter>output['out_format'] == '7'</filter>
        </data>
        <data name="output_f8" format="tabular" label="${tool.name} on ${on_string}: Indicating position of each marker gene within a bin" from_work_dir="output_file">
            <filter>output['out_format'] == '8'</filter>
        </data>
        <data name="output_f9" format="tabular" label="${tool.name} on ${on_string}: Marker genes identified in each bin and their sequence" from_work_dir="output_file">
            <filter>output['out_format'] == '9'</filter>
        </data>
        <!-- Always produced: extended bin statistics collected from the storage directory. -->
        <data name="bin_stats_ext" format="tabular" label="${tool.name} on ${on_string}: Marker gene bin extensive stats" from_work_dir="output/storage/bin_stats_ext.tsv"/>
        <!-- Further optional outputs are defined in the imported macros. -->
        <expand macro="qa_extra_outputs" />
    </outputs>
    <tests>
        <!-- Output format 1 with the marker_gene_stats extra output selected: three datasets expected. -->
        <test expect_num_outputs="3">
            <!-- Input files -->
            <param name="marker_file" value="lineage_marker_set" ftype="tabular"/>
            <param name="checkm_hmm_info" value="checkm_hmm_info.pkl.gz" ftype="zip"/>
            <param name="bin_stats_analyze" value="bin_stats.analyze.tsv" ftype="tabular"/>
            <param name="hmmer_analyze">
                <collection type="list">
                    <element name="637000110" value="hmmer.analyze.txt" ftype="txt"/>
                </collection>
            </param>
            <!-- Report selection -->
            <conditional name="output">
                <param name="out_format" value="1"/>
            </conditional>
            <!-- QA options -->
            <param name="individual_markers" value="false"/>
            <param name="skip_adj_correction" value="false"/>
            <param name="skip_pseudogene_correction" value="false"/>
            <param name="aai_strain" value="0.9"/>
            <param name="ignore_thresholds" value="false"/>
            <param name="e_value" value="1e-10"/>
            <param name="length" value="0.7"/>
            <param name="extra_outputs" value="marker_gene_stats"/>
            <!-- Expected datasets -->
            <output name="output_f1" ftype="tabular">
                <assert_contents>
                    <has_text text="Marker lineage"/>
                    <has_text text="637000110"/>
                    <has_text text="f__Enterobacteriaceae"/>
                </assert_contents>
            </output>
            <output name="bin_stats_ext" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="marker lineage"/>
                    <has_text text="GCN0"/>
                    <has_text text="Longest contig"/>
                </assert_contents>
            </output>
            <!-- NOTE(review): the alignment_file assertions below are disabled; the reason is not recorded in this file. -->
            <!--<output name="alignment_file" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="Lineage Marker File"/>
                    <has_text text="UID5139"/>
                </assert_contents>
            </output>-->
            <output name="marker_gene_stats" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="AC_000091_751"/>
                    <has_text text="TIGR02432"/>
                </assert_contents>
            </output>
        </test>
        <!-- Output format 2 (extended summary), no extra outputs: two datasets expected. -->
        <test expect_num_outputs="2">
            <!-- Input files -->
            <param name="marker_file" value="lineage_marker_set" ftype="tabular"/>
            <param name="checkm_hmm_info" value="checkm_hmm_info.pkl.gz" ftype="zip"/>
            <param name="bin_stats_analyze" value="bin_stats.analyze.tsv" ftype="tabular"/>
            <param name="hmmer_analyze">
                <collection type="list">
                    <element name="637000110" value="hmmer.analyze.txt" ftype="txt"/>
                </collection>
            </param>
            <!-- Report selection -->
            <conditional name="output">
                <param name="out_format" value="2"/>
            </conditional>
            <!-- QA options -->
            <param name="individual_markers" value="false"/>
            <param name="skip_adj_correction" value="false"/>
            <param name="skip_pseudogene_correction" value="false"/>
            <param name="aai_strain" value="0.9"/>
            <param name="ignore_thresholds" value="false"/>
            <param name="e_value" value="1e-10"/>
            <param name="length" value="0.7"/>
            <param name="extra_outputs" value=""/>
            <!-- Expected datasets -->
            <output name="output_f2" ftype="tabular">
                <assert_contents>
                    <has_text text="Marker lineage"/>
                    <has_text text="Mean scaffold length"/>
                    <has_text text="f__Enterobacteriaceae"/>
                </assert_contents>
            </output>
            <output name="bin_stats_ext" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="marker lineage"/>
                    <has_text text="GCN0"/>
                    <has_text text="Longest contig"/>
                </assert_contents>
            </output>
        </test>
        <!-- Output format 3 (quality for increasingly basal marker sets), no extra outputs: two datasets expected. -->
        <test expect_num_outputs="2">
            <!-- Input files -->
            <param name="marker_file" value="lineage_marker_set" ftype="tabular"/>
            <param name="checkm_hmm_info" value="checkm_hmm_info.pkl.gz" ftype="zip"/>
            <param name="bin_stats_analyze" value="bin_stats.analyze.tsv" ftype="tabular"/>
            <param name="hmmer_analyze">
                <collection type="list">
                    <element name="637000110" value="hmmer.analyze.txt" ftype="txt"/>
                </collection>
            </param>
            <!-- Report selection -->
            <conditional name="output">
                <param name="out_format" value="3"/>
            </conditional>
            <!-- QA options -->
            <param name="individual_markers" value="false"/>
            <param name="skip_adj_correction" value="false"/>
            <param name="skip_pseudogene_correction" value="false"/>
            <param name="aai_strain" value="0.9"/>
            <param name="ignore_thresholds" value="false"/>
            <param name="e_value" value="1e-10"/>
            <param name="length" value="0.7"/>
            <param name="extra_outputs" value=""/>
            <!-- Expected datasets -->
            <output name="output_f3" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="Strain heterogeneity"/>
                    <has_text text="UID5139"/>
                    <has_text text="p__Proteobacteria"/>
                </assert_contents>
            </output>
            <output name="bin_stats_ext" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="marker lineage"/>
                    <has_text text="GCN0"/>
                    <has_text text="Longest contig"/>
                </assert_contents>
            </output>
        </test>
        <!-- Output format 4 (marker genes and their counts), no extra outputs: two datasets expected. -->
        <test expect_num_outputs="2">
            <!-- Input files -->
            <param name="marker_file" value="lineage_marker_set" ftype="tabular"/>
            <param name="checkm_hmm_info" value="checkm_hmm_info.pkl.gz" ftype="zip"/>
            <param name="bin_stats_analyze" value="bin_stats.analyze.tsv" ftype="tabular"/>
            <param name="hmmer_analyze">
                <collection type="list">
                    <element name="637000110" value="hmmer.analyze.txt" ftype="txt"/>
                </collection>
            </param>
            <!-- Report selection -->
            <conditional name="output">
                <param name="out_format" value="4"/>
            </conditional>
            <!-- QA options -->
            <param name="individual_markers" value="false"/>
            <param name="skip_adj_correction" value="false"/>
            <param name="skip_pseudogene_correction" value="false"/>
            <param name="aai_strain" value="0.9"/>
            <param name="ignore_thresholds" value="false"/>
            <param name="e_value" value="1e-10"/>
            <param name="length" value="0.7"/>
            <param name="extra_outputs" value=""/>
            <!-- Expected datasets -->
            <output name="output_f4" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="Node Id: UID5103; Marker lineage: f__Enterobacteriaceae"/>
                    <has_text text="PF02542.1"/>
                </assert_contents>
            </output>
            <output name="bin_stats_ext" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="marker lineage"/>
                    <has_text text="GCN0"/>
                    <has_text text="Longest contig"/>
                </assert_contents>
            </output>
        </test>
        <!-- Output format 5 (bin id, marker gene id, gene id), no extra outputs: two datasets expected. -->
        <test expect_num_outputs="2">
            <!-- Input files -->
            <param name="marker_file" value="lineage_marker_set" ftype="tabular"/>
            <param name="checkm_hmm_info" value="checkm_hmm_info.pkl.gz" ftype="zip"/>
            <param name="bin_stats_analyze" value="bin_stats.analyze.tsv" ftype="tabular"/>
            <param name="hmmer_analyze">
                <collection type="list">
                    <element name="637000110" value="hmmer.analyze.txt" ftype="txt"/>
                </collection>
            </param>
            <!-- Report selection -->
            <conditional name="output">
                <param name="out_format" value="5"/>
            </conditional>
            <!-- QA options -->
            <param name="individual_markers" value="false"/>
            <param name="skip_adj_correction" value="false"/>
            <param name="skip_pseudogene_correction" value="false"/>
            <param name="aai_strain" value="0.9"/>
            <param name="ignore_thresholds" value="false"/>
            <param name="e_value" value="1e-10"/>
            <param name="length" value="0.7"/>
            <param name="extra_outputs" value=""/>
            <!-- Expected datasets -->
            <output name="output_f5" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="TIGR02432"/>
                    <has_text text="AC_000091_165"/>
                </assert_contents>
            </output>
            <output name="bin_stats_ext" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="marker lineage"/>
                    <has_text text="GCN0"/>
                    <has_text text="Longest contig"/>
                </assert_contents>
            </output>
        </test>
        <!-- Output format 6 (markers present multiple times in a bin), no extra outputs: two datasets expected. -->
        <test expect_num_outputs="2">
            <!-- Input files -->
            <param name="marker_file" value="lineage_marker_set" ftype="tabular"/>
            <param name="checkm_hmm_info" value="checkm_hmm_info.pkl.gz" ftype="zip"/>
            <param name="bin_stats_analyze" value="bin_stats.analyze.tsv" ftype="tabular"/>
            <param name="hmmer_analyze">
                <collection type="list">
                    <element name="637000110" value="hmmer.analyze.txt" ftype="txt"/>
                </collection>
            </param>
            <!-- Report selection -->
            <conditional name="output">
                <param name="out_format" value="6"/>
            </conditional>
            <!-- QA options -->
            <param name="individual_markers" value="false"/>
            <param name="skip_adj_correction" value="false"/>
            <param name="skip_pseudogene_correction" value="false"/>
            <param name="aai_strain" value="0.9"/>
            <param name="ignore_thresholds" value="false"/>
            <param name="e_value" value="1e-10"/>
            <param name="length" value="0.7"/>
            <param name="extra_outputs" value=""/>
            <!-- Expected datasets -->
            <output name="output_f6" ftype="tabular">
                <assert_contents>
                    <has_text text="Marker Id"/>
                    <has_text text="No marker genes satisfied"/>
                </assert_contents>
            </output>
            <output name="bin_stats_ext" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="marker lineage"/>
                    <has_text text="GCN0"/>
                    <has_text text="Longest contig"/>
                </assert_contents>
            </output>
        </test>
        <!-- Output format 7 (markers present multiple times on one scaffold), no extra outputs: two datasets expected. -->
        <test expect_num_outputs="2">
            <!-- Input files -->
            <param name="marker_file" value="lineage_marker_set" ftype="tabular"/>
            <param name="checkm_hmm_info" value="checkm_hmm_info.pkl.gz" ftype="zip"/>
            <param name="bin_stats_analyze" value="bin_stats.analyze.tsv" ftype="tabular"/>
            <param name="hmmer_analyze">
                <collection type="list">
                    <element name="637000110" value="hmmer.analyze.txt" ftype="txt"/>
                </collection>
            </param>
            <!-- Report selection -->
            <conditional name="output">
                <param name="out_format" value="7"/>
            </conditional>
            <!-- QA options -->
            <param name="individual_markers" value="false"/>
            <param name="skip_adj_correction" value="false"/>
            <param name="skip_pseudogene_correction" value="false"/>
            <param name="aai_strain" value="0.9"/>
            <param name="ignore_thresholds" value="false"/>
            <param name="e_value" value="1e-10"/>
            <param name="length" value="0.7"/>
            <param name="extra_outputs" value=""/>
            <!-- Expected datasets -->
            <output name="output_f7" ftype="tabular">
                <assert_contents>
                    <has_text text="Marker Id"/>
                    <has_text text="No marker genes satisfied"/>
                </assert_contents>
            </output>
            <output name="bin_stats_ext" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="marker lineage"/>
                    <has_text text="GCN0"/>
                    <has_text text="Longest contig"/>
                </assert_contents>
            </output>
        </test>
        <!-- Output format 8 (position of each marker gene within a bin), no extra outputs: two datasets expected. -->
        <test expect_num_outputs="2">
            <!-- Input files -->
            <param name="marker_file" value="lineage_marker_set" ftype="tabular"/>
            <param name="checkm_hmm_info" value="checkm_hmm_info.pkl.gz" ftype="zip"/>
            <param name="bin_stats_analyze" value="bin_stats.analyze.tsv" ftype="tabular"/>
            <param name="hmmer_analyze">
                <collection type="list">
                    <element name="637000110" value="hmmer.analyze.txt" ftype="txt"/>
                </collection>
            </param>
            <!-- Report selection -->
            <conditional name="output">
                <param name="out_format" value="8"/>
            </conditional>
            <!-- QA options -->
            <param name="individual_markers" value="false"/>
            <param name="skip_adj_correction" value="false"/>
            <param name="skip_pseudogene_correction" value="false"/>
            <param name="aai_strain" value="0.9"/>
            <param name="ignore_thresholds" value="false"/>
            <param name="e_value" value="1e-10"/>
            <param name="length" value="0.7"/>
            <param name="extra_outputs" value=""/>
            <!-- Expected datasets -->
            <output name="output_f8" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="AC_000091_183"/>
                    <has_text text="TIGR02075,9,240"/>
                </assert_contents>
            </output>
            <output name="bin_stats_ext" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="marker lineage"/>
                    <has_text text="GCN0"/>
                    <has_text text="Longest contig"/>
                </assert_contents>
            </output>
        </test>
        <!-- Output format 9 (marker genes with their sequence): also supplies the genes_faa collection
             and an exclude_markers file; two datasets expected. -->
        <test expect_num_outputs="2">
            <!-- Input files -->
            <param name="marker_file" value="lineage_marker_set" ftype="tabular"/>
            <param name="checkm_hmm_info" value="checkm_hmm_info.pkl.gz" ftype="zip"/>
            <param name="bin_stats_analyze" value="bin_stats.analyze.tsv" ftype="tabular"/>
            <param name="hmmer_analyze">
                <collection type="list">
                    <element name="637000110" value="hmmer.analyze.txt" ftype="txt"/>
                </collection>
            </param>
            <!-- Report selection, including the gene sequences required by format 9 -->
            <conditional name="output">
                <param name="out_format" value="9"/>
                <param name="genes_faa">
                    <collection type="list">
                        <element name="637000110" value="637000110.faa" ftype="fasta"/>
                    </collection>
                </param>
            </conditional>
            <!-- QA options -->
            <param name="exclude_markers" value="markers_to_exclude" ftype="txt"/>
            <param name="individual_markers" value="false"/>
            <param name="skip_adj_correction" value="false"/>
            <param name="skip_pseudogene_correction" value="false"/>
            <param name="aai_strain" value="0.9"/>
            <param name="ignore_thresholds" value="false"/>
            <param name="e_value" value="1e-10"/>
            <param name="length" value="0.7"/>
            <param name="extra_outputs" value=""/>
            <!-- Expected datasets -->
            <output name="output_f9" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="Sequence"/>
                    <has_text text="PF06574.7"/>
                    <has_text text="MKLIRGI"/>
                </assert_contents>
            </output>
            <output name="bin_stats_ext" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="marker lineage"/>
                    <has_text text="GCN0"/>
                    <has_text text="Longest contig"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
@HELP_HEADER@

This command identifies marker genes in bins and calculates genome statistics

Adjacent called genes matching the same marker gene may indicate a true duplication event, a gene calling error, or an assembly error. If adjacent genes hit distinct regions of the same marker gene HMM, CheckM assumes a gene calling error has occurred and concatenates the two genes. When this occurs, CheckM concatenates the gene ids of the two genes with a pair of ampersands (&&).

Outputs
=======
The content of the main output depends on the selected output format:

1. Summary of bin completeness, contamination, and strain heterogeneity
    Bin Id: bin identifier derived from input FASTA file
    Marker lineage: indicates lineage used for inferring marker set (a precise indication of where a bin was placed in CheckM's reference tree can be obtained with the tree_qa command)
    No. genomes: number of reference genomes used to infer marker set
    No. markers: number of inferred marker genes
    No. marker sets: number of inferred co-located marker sets
    0-5+: number of times each marker gene is identified
    Completeness: estimated completeness
    Contamination: estimated contamination
    Strain heterogeneity: estimated strain heterogeneity
2. Extended summary of bin quality (includes GC, genome size, coding density, ...)
3. Summary of bin quality for increasingly basal lineage-specific marker sets
    Node Id: unique id of internal node in genome tree from which lineage-specific markers were inferred
4. List of marker genes for each bin along with the number of times each marker was identified
    Node Id: unique id of internal node in genome tree from which lineage-specific markers were inferred
    Marker lineage: indicates lineage used for inferring marker set
    Useful for identifying lineage-specific gene loss or duplication
5. List of bin id, marker gene id, and called gene id for each identified marker gene
6. List of marker genes present multiple times in a bin
7. List of marker genes present multiple times on the same scaffold
    Useful for identifying true gene duplication events, gene calling errors, or assembly errors. See the note on adjacent genes above.
8. List indicating the position of each marker gene within a bin
9. Marker genes identified in each bin and their sequence

    ]]></help>
    <expand macro="citations"/>
</tool>