view lineage_wf.xml @ 1:f0107b9f2dc3 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/checkm commit a96defec82bfa9628fa821ffa6df5c68314a41d8
author iuc
date Mon, 08 Aug 2022 19:59:56 +0000
parents 760dc0c0e689
children
line wrap: on
line source

<tool id="checkm_lineage_wf" name="CheckM lineage_wf" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <!-- One-line summary of what the tool does. -->
    <description>
        Assessing the completeness and contamination of genome bins using lineage-specific marker sets
    </description>
    <!-- Shared definitions. The @...@ tokens and the <expand> macros used throughout
         this file are not defined here; presumably they all come from macros.xml. -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="biotools"/>
    <expand macro="requirements"/>
    <expand macro="version"/>
    <!-- Command template. @BIN_INPUTS@ is a macro token defined outside this file
         (presumably it stages the input bins into the 'bins' directory; confirm in
         macros.xml). checkm lineage_wf then reads 'bins' and writes to 'output'.
         Values from the tree_analyze, lineage_set and qa input sections are passed
         through, the summary table is written to $results as a tab-separated table,
         and both thread options take GALAXY_SLOTS with a fallback of 1.
         Job failure is detected from the exit code. -->
    <command detect_errors="exit_code"><![CDATA[
@BIN_INPUTS@

checkm lineage_wf
    'bins'
    'output'
    $tree_analyze.reduced_tree
    $tree_analyze.ali
    $tree_analyze.nt
    $tree_analyze.genes
    --unique '$lineage_set.unique'
    --multi '$lineage_set.multi'
    $lineage_set.force_domain
    $lineage_set.no_refinement
    $qa.individual_markers
    $qa.skip_adj_correction
    $qa.skip_pseudogene_correction
    --aai_strain $qa.aai_strain
    $qa.ignore_thresholds
    --e_value $qa.e_value
    --length $qa.length
    --file '$results'
    --tab_table
    --extension 'fasta'
    --threads \${GALAXY_SLOTS:-1}
    --pplacer_threads \${GALAXY_SLOTS:-1}
    ]]></command>
    <inputs>
        <!-- Bin selection parameters come from a macro defined outside this file;
             the tests address them through a conditional named "bins". -->
        <expand macro="bin_inputs" />
        <!-- The three sections below supply the $tree_analyze.*, $lineage_set.* and
             $qa.* values referenced in the command; their parameters are macro-defined. -->
        <section name="tree_analyze" title="Bin placement in the genome tree and marker gene identification">
            <expand macro="tree_params" />
        </section>
        <section name="lineage_set" title="Bin lineage-specific marker set inference">
            <expand macro="lineage_set_params" />
        </section>
        <section name="qa" title="Bin assessment">
            <expand macro="qa_params" />
        </section>
        <!-- Optional multi-select. Each selected value is looked up by name in the
             <filter> of the matching output. Options that are not listed literally
             here are contributed by the *_extra_output_options macros. -->
        <param name="extra_outputs" type="select" multiple="true" optional="true" label="Extra outputs">
            <option value="phylo_hmm_info">Phylogenetic HMM model info for each bin</option>
            <option value="bin_stats_tree">Phylogenetic bin stats</option>
            <option value="hmmer_tree">Phylogenetic HMM hits to each bin</option>
            <option value="concatenated_tre">Concatenated tree</option>
            <option value="concatenated_fasta">Concatenated masked sequences</option>
            <expand macro="tree_extra_output_options" />
            <option value="marker_file">Marker genes</option>
            <option value="hmmer_analyze">Marker gene HMM hits to each bin</option>
            <option value="bin_stats_analyze">Marker gene bin stats</option>
            <option value="checkm_hmm_info">Marker gene HMM info for each bin</option>
            <expand macro="analyze_extra_output_options" />
            <option value="bin_stats_ext">Marker gene bin extensive stats</option>
            <expand macro="qa_extra_output_options" />
        </param>
    </inputs>
    <outputs>
        <!-- Always produced: the command writes the summary table to '$results'. -->
        <data name="results" format="tabular" label="${tool.name} on ${on_string}: Bin statistics"/>
        <!--tree outputs-->
        <!-- Every optional output is kept only when its value is selected in
             extra_outputs; the leading "extra_outputs and" short-circuits when the
             selection is empty. Single files are collected from the 'output' work
             directory via from_work_dir. -->
        <data name="phylo_hmm_info" format="zip" from_work_dir="output/storage/phylo_hmm_info.pkl.gz" label="${tool.name} on ${on_string}: Phylogenetic HMM model info for each bin">
            <filter>extra_outputs and 'phylo_hmm_info' in extra_outputs</filter>
        </data>
        <data name="bin_stats_tree" format="tabular" from_work_dir="output/storage/bin_stats.tree.tsv" label="${tool.name} on ${on_string}: Phylogenetic bin stats">
            <filter>extra_outputs and 'bin_stats_tree' in extra_outputs</filter>
        </data>
        <!-- Per-bin files become list collections: the pattern is matched against
             paths relative to output/bins/ (searched recursively) and the part
             captured by the designation group names the element; the tests expect
             that to be the bin name. -->
        <collection name="hmmer_tree" type="list" label="${tool.name} on ${on_string}: Phylogenetic HMM hits to each bin">
            <filter>extra_outputs and 'hmmer_tree' in extra_outputs</filter>
            <discover_datasets pattern="(?P&lt;designation&gt;.*)/hmmer\.tree\.txt" format="txt" directory="output/bins/" recurse="true" match_relative_path="true"/>
        </collection>
        <data name="concatenated_fasta" format="fasta" from_work_dir="output/storage/tree/concatenated.fasta" label="${tool.name} on ${on_string}: Concatenated masked sequences">
            <filter>extra_outputs and 'concatenated_fasta' in extra_outputs</filter>
        </data>
        <!-- NOTE(review): the source file is concatenated.tre but the declared format
             is phyloxml; confirm the datatype. The second test asserts ftype
             phyloxml, so any change has to be made in both places. -->
        <data name="concatenated_tre" format="phyloxml" from_work_dir="output/storage/tree/concatenated.tre" label="${tool.name} on ${on_string}: Concatenated tree">
            <filter>extra_outputs and 'concatenated_tre' in extra_outputs</filter>
        </data>
        <!-- Additionally requires the tree_analyze "ali" parameter to be set. -->
        <collection name="hmmer_tree_ali" type="list" label="${tool.name} on ${on_string}: Phylogenetic HMMER alignment file for each bin">
            <filter>tree_analyze['ali'] and extra_outputs and 'hmmer_tree_ali' in extra_outputs</filter>
            <discover_datasets pattern="(?P&lt;designation&gt;.*)/hmmer\.tree\.ali\.txt" format="txt" directory="output/bins/" recurse="true" match_relative_path="true"/>
        </collection>
        <!-- pplacer placement JSON, kept only when selected in extra_outputs.
             The filter value is spelled like the output name, matching every other
             output in this file. Assumes the option contributed by the
             tree_extra_output_options macro uses the same value; confirm against
             macros.xml. -->
        <data name="concatenated_pplacer_json" format="json" from_work_dir="output/storage/tree/concatenated.pplacer.json" label="${tool.name} on ${on_string}: Concatenated pplacer JSON">
            <filter>extra_outputs and 'concatenated_pplacer_json' in extra_outputs</filter>
        </data>
        <!-- Per-bin gene sequence collections. genes.fna holds nucleotide sequences
             and is only offered when the "nt" parameter is set and genes were not
             supplied as input; genes.faa holds the protein (amino acid) sequences.
             The labels follow the file contents. -->
        <collection name="genes_fna" type="list" label="${tool.name} on ${on_string}: Nucleotide gene sequences for each bin">
            <filter>not tree_analyze['genes'] and tree_analyze['nt'] and extra_outputs and 'genes_fna' in extra_outputs</filter>
            <discover_datasets pattern="(?P&lt;designation&gt;.*)/genes\.fna" format="fasta" directory="output/bins/" recurse="true" match_relative_path="true"/>
        </collection>
        <!-- NOTE(review): unlike genes_fna and genes_gff, this filter does not check
             tree_analyze['genes']; confirm whether genes.faa exists in that mode. -->
        <collection name="genes_faa" type="list" label="${tool.name} on ${on_string}: Protein gene sequences for each bin">
            <filter>extra_outputs and 'genes_faa' in extra_outputs</filter>
            <discover_datasets pattern="(?P&lt;designation&gt;.*)/genes\.faa" format="fasta" directory="output/bins/" recurse="true" match_relative_path="true"/>
        </collection>
        <!-- Per-bin GFF files; not offered when the tree_analyze "genes" parameter is
             set (presumably no gene calling output exists in that mode; confirm). -->
        <collection name="genes_gff" type="list" label="${tool.name} on ${on_string}: Gene feature files for each bin">
            <filter>not tree_analyze['genes'] and extra_outputs and 'genes_gff' in extra_outputs</filter>
            <discover_datasets pattern="(?P&lt;designation&gt;.*)/genes\.gff" format="gff" directory="output/bins/" recurse="true" match_relative_path="true"/>
        </collection>
        <!--lineage_set outputs-->
        <!-- Marker file taken from output/lineage.ms when selected. -->
        <data name="marker_file" format="tabular" from_work_dir="output/lineage.ms" label="${tool.name} on ${on_string}: Marker genes">
            <filter>extra_outputs and 'marker_file' in extra_outputs</filter>
        </data>
        <!--analyze outputs-->
        <!-- One hmmer.analyze.txt per bin, collected recursively from output/bins/;
             the designation group of the pattern names each element. -->
        <collection name="hmmer_analyze" type="list" label="${tool.name} on ${on_string}: Marker gene HMM hits to each bin">
            <filter>extra_outputs and 'hmmer_analyze' in extra_outputs</filter>
            <discover_datasets pattern="(?P&lt;designation&gt;.*)/hmmer\.analyze\.txt" format="txt" directory="output/bins/" recurse="true" match_relative_path="true"/>
        </collection>
        <data name="bin_stats_analyze" format="tabular" from_work_dir="output/storage/bin_stats.analyze.tsv" label="${tool.name} on ${on_string}: Marker gene bin stats">
            <filter>extra_outputs and 'bin_stats_analyze' in extra_outputs</filter>
        </data>
        <data name="checkm_hmm_info" format="zip" from_work_dir="output/storage/checkm_hmm_info.pkl.gz" label="${tool.name} on ${on_string}: Marker gene HMM info for each bin" >
            <filter>extra_outputs and 'checkm_hmm_info' in extra_outputs</filter>
        </data>
        <!-- Additionally requires the tree_analyze "ali" parameter to be set. -->
        <collection name="hmmer_analyze_ali" type="list" label="${tool.name} on ${on_string}: HMMER alignment file for each bin">
            <filter>tree_analyze['ali'] and extra_outputs and 'hmmer_analyze_ali' in extra_outputs</filter>
            <discover_datasets pattern="(?P&lt;designation&gt;.*)/hmmer\.analyze\.ali\.txt" format="txt" directory="output/bins/" recurse="true" match_relative_path="true"/>
        </collection>
        <!--qa outputs-->
        <data name="bin_stats_ext" format="tabular" from_work_dir="output/storage/bin_stats_ext.tsv" label="${tool.name} on ${on_string}: Marker gene bin extensive stats">
            <filter>extra_outputs and 'bin_stats_ext' in extra_outputs</filter>
        </data>
        <!-- Remaining qa outputs are macro-defined outside this file; the second test
             expects one of them to be named marker_gene_stats. -->
        <expand macro="qa_extra_outputs" />
    </outputs>
    <tests>
        <!-- Test 1: a single FASTA bin supplied individually, reduced tree enabled,
             no extra outputs selected. Only the results table is expected. -->
        <test expect_num_outputs="1">
            <conditional name="bins">
                <param name="select" value="individual"/>
                <param name="bins_ind" ftype="fasta" value="637000110.fna"/>
            </conditional>
            <section name="tree_analyze">
                <param name="reduced_tree" value="true"/>
                <param name="ali" value="false"/>
                <param name="nt" value="false"/>
                <param name="genes" value="false"/>
            </section>
            <section name="lineage_set">
                <param name="unique" value="10"/>
                <param name="multi" value="10"/>
                <param name="force_domain" value="false"/>
                <param name="no_refinement" value="false"/>
            </section>
            <section name="qa">
                <param name="individual_markers" value="false"/>
                <param name="skip_adj_correction" value="false"/>
                <param name="skip_pseudogene_correction" value="false"/>
                <param name="aai_strain" value="0.9"/>
                <param name="ignore_thresholds" value="false"/>
                <param name="e_value" value="1e-10"/>
                <param name="length" value="0.7"/>
            </section>
            <param name="extra_outputs" value=""/>
            <!-- The table must mention the bin, the lineage column header and a bacterial lineage. -->
            <output name="results" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="Marker lineage"/>
                    <has_text text="k__Bacteria"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test 2: the same bin supplied as a one-element list collection, with
             HMMER alignments enabled and eleven extra outputs selected. Together
             with the results table that gives the 12 expected outputs. -->
        <test expect_num_outputs="12">
            <conditional name="bins">
                <param name="select" value="collection"/>
                <param name="bins_coll">
                    <collection type="list">
                        <element name="637000110" ftype="fasta" value="637000110.fna"/>
                    </collection>
                </param>
            </conditional>
            <section name="tree_analyze">
                <param name="reduced_tree" value="true"/>
                <param name="ali" value="true"/>
                <param name="nt" value="false"/>
                <param name="genes" value="false"/>
            </section>
            <section name="lineage_set">
                <param name="unique" value="10"/>
                <param name="multi" value="10"/>
                <param name="force_domain" value="false"/>
                <param name="no_refinement" value="false"/>
            </section>
            <section name="qa">
                <param name="individual_markers" value="false"/>
                <param name="skip_adj_correction" value="false"/>
                <param name="skip_pseudogene_correction" value="false"/>
                <param name="aai_strain" value="0.9"/>
                <param name="ignore_thresholds" value="false"/>
                <param name="e_value" value="1e-10"/>
                <param name="length" value="0.7"/>
            </section>
            <!-- The alignment collections are not selected here even though "ali" is true. -->
            <param name="extra_outputs" value="phylo_hmm_info,bin_stats_tree,hmmer_tree,concatenated_tre,concatenated_fasta,marker_file,hmmer_analyze,bin_stats_analyze,bin_stats_ext,checkm_hmm_info,marker_gene_stats"/>
            <output name="results" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="Marker lineage"/>
                    <has_text text="k__Bacteria"/>
                </assert_contents>
            </output>
            <!-- Compressed pickle outputs are only checked by approximate size. -->
            <output name="phylo_hmm_info" ftype="zip">
                <assert_contents>
                    <has_size value="1575" delta="10"/>
                </assert_contents>
            </output>
            <output name="bin_stats_tree" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="Mean scaffold length"/>
                    <has_text text="Translation table"/>
                </assert_contents>
            </output>
            <!-- Collection elements are expected to be named after the bin. -->
            <output_collection name="hmmer_tree" count="1">
                <element name="637000110" ftype="txt">
                    <assert_contents>
                        <has_text text="target name"/>
                        <has_text text="AC_000091_79"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output name="concatenated_fasta" ftype="fasta">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="MLKAGVHFGHQ"/>
                </assert_contents>
            </output>
            <output name="concatenated_tre" ftype="phyloxml">
                <assert_contents>
                    <has_text text="IMG_646564547"/>
                    <has_text text="g__Methanocaldococcus"/>
                </assert_contents>
            </output>
            <output name="marker_file" ftype="tabular">
                <assert_contents>
                    <has_text text="Lineage Marker File"/>
                    <has_text text="637000110"/>
                    <has_text text="k__Bacteria"/>
                </assert_contents>
            </output>
            <output_collection name="hmmer_analyze" count="1">
                <element name="637000110" ftype="txt">
                    <assert_contents>
                        <has_text text="target name"/>
                        <has_text text="AC_000091_859"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output name="bin_stats_analyze" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="GC"/>
                    <has_text text="GC std"/>
                </assert_contents>
            </output>
            <output name="bin_stats_ext" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="marker lineage"/>
                </assert_contents>
            </output>
            <output name="checkm_hmm_info" ftype="zip">
                <assert_contents>
                    <has_size value="17052" delta="200"/>
                </assert_contents>
            </output>
            <!-- This output is not declared literally in this file; presumably it comes
                 from the qa_extra_outputs macro. -->
            <output name="marker_gene_stats" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="AC_000091_79"/>
                    <has_text text="PF00318.15"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
@HELP_HEADER@

This command runs the recommended workflow for assessing the completeness and contamination of genome bins, which uses lineage-specific marker sets.
This workflow consists of 4 mandatory (M) steps and 1 recommended (R) step:

- (M) The tree command places genome bins into a reference genome tree
- (R) The tree_qa command indicates the number of phylogenetically informative marker genes found in each genome bin along with a taxonomic string indicating its approximate placement in the tree. 

    If desired, genome bins with few phylogenetically informative marker genes may be removed in order to reduce the computational requirements of the following commands.
    Alternatively, if only genomes from a particular taxonomic group are of interest these can be moved to a new directory and analyzed separately. 

- (M) The lineage_set command creates a marker file indicating lineage-specific marker sets suitable for evaluating each genome. 
- (M) The analyze command identifies marker genes and estimates the completeness and contamination of each genome bin. 
- (M) The qa command can be used to produce different tables summarizing the quality of each genome bin.

    ]]></help>
    <!-- Citation entries come from the "citations" macro, defined outside this file. -->
    <expand macro="citations"/>
</tool>