diff lineage_wf.xml @ 0:760dc0c0e689 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/checkm commit 2a3b068a98bf0e913dc03e0d5c2182cfd102cf27
author iuc
date Fri, 29 Jul 2022 20:30:08 +0000
parents
children f0107b9f2dc3
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lineage_wf.xml	Fri Jul 29 20:30:08 2022 +0000
@@ -0,0 +1,295 @@
+<tool id="checkm_lineage_wf" name="CheckM lineage_wf" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <description>
+        Assessing the completeness and contamination of genome bins using lineage-specific marker sets
+    </description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="biotools"/>
+    <expand macro="requirements"/>
+    <expand macro="version"/>
+    <command detect_errors="exit_code"><![CDATA[
+@BIN_INPUTS@
+
+checkm lineage_wf
+    'bins'
+    'output'
+    $tree_analyze.reduced_tree
+    $tree_analyze.ali
+    $tree_analyze.nt
+    $tree_analyze.genes
+    --unique '$lineage_set.unique'
+    --multi '$lineage_set.multi'
+    $lineage_set.force_domain
+    $lineage_set.no_refinement
+    $qa.individual_markers
+    $qa.skip_adj_correction
+    $qa.skip_pseudogene_correction
+    --aai_strain $qa.aai_strain
+    $qa.ignore_thresholds
+    --e_value $qa.e_value
+    --length $qa.length
+    --file '$results'
+    --tab_table
+    --extension 'fasta'
+    --threads \${GALAXY_SLOTS:-1}
+    --pplacer_threads \${GALAXY_SLOTS:-1}
+    ]]></command>
+    <inputs>
+        <expand macro="bin_inputs" />
+        <section name="tree_analyze" title="Bin placement in the genome tree and marker gene identification">
+            <expand macro="tree_params" />
+        </section>
+        <section name="lineage_set" title="Bin lineage-specific marker set inference">
+            <expand macro="lineage_set_params" />
+        </section>
+        <section name="qa" title="Bin assessment">
+            <expand macro="qa_params" />
+        </section>
+        <param name="extra_outputs" type="select" multiple="true" optional="true" label="Extra outputs">
+            <option value="phylo_hmm_info">Phylogenetic HMM model info for each bin</option>
+            <option value="bin_stats_tree">Phylogenetic bin stats</option>
+            <option value="hmmer_tree">Phylogenetic HMM hits to each bin</option>
+            <option value="concatenated_tre">Concatenated tree</option>
+            <option value="concatenated_fasta">Concatenated masked sequences</option>
+            <expand macro="tree_extra_output_options" />
+            <option value="marker_file">Marker genes</option>
+            <option value="hmmer_analyze">Marker gene HMM hits to each bin</option>
+            <option value="bin_stats_analyze">Marker gene bin stats</option>
+            <option value="checkm_hmm_info">Marker gene HMM info for each bin</option>
+            <expand macro="analyze_extra_output_options" />
+            <option value="bin_stats_ext">Marker gene bin extensive stats</option>
+            <expand macro="qa_extra_output_options" />
+        </param>
+    </inputs>
+    <outputs>
+        <data name="results" format="tabular" label="${tool.name} on ${on_string}: Bin statistics"/>
+        <!--tree outputs-->
+        <data name="phylo_hmm_info" format="zip" from_work_dir="output/storage/phylo_hmm_info.pkl.gz" label="${tool.name} on ${on_string}: Phylogenetic HMM model info for each bin">
+            <filter>'phylo_hmm_info' in extra_outputs</filter>
+        </data>
+        <data name="bin_stats_tree" format="tabular" from_work_dir="output/storage/bin_stats.tree.tsv" label="${tool.name} on ${on_string}: Phylogenetic bin stats">
+            <filter>'bin_stats_tree' in extra_outputs</filter>
+        </data>
+        <collection name="hmmer_tree" type="list" label="${tool.name} on ${on_string}: Phylogenetic HMM hits to each bin">
+            <filter>'hmmer_tree' in extra_outputs</filter>
+            <discover_datasets pattern="(?P&lt;designation&gt;.*)/hmmer\.tree\.txt" format="txt" directory="output/bins/" recurse="true" match_relative_path="true"/>
+        </collection>
+        <data name="concatenated_fasta" format="fasta" from_work_dir="output/storage/tree/concatenated.fasta" label="${tool.name} on ${on_string}: Concatenated masked sequences">
+            <filter>'concatenated_fasta' in extra_outputs</filter>
+        </data>
+        <data name="concatenated_tre" format="phyloxml" from_work_dir="output/storage/tree/concatenated.tre" label="${tool.name} on ${on_string}: Concatenated tree">
+            <filter>'concatenated_tre' in extra_outputs</filter>
+        </data>
+        <collection name="hmmer_tree_ali" type="list" label="${tool.name} on ${on_string}: Phylogenetic HMMER alignment file for each bin">
+            <filter>tree_analyze['ali'] and 'hmmer_tree_ali' in extra_outputs</filter>
+            <discover_datasets pattern="(?P&lt;designation&gt;.*)/hmmer\.tree\.ali\.txt" format="txt" directory="output/bins/" recurse="true" match_relative_path="true"/>
+        </collection>
+        <data name="concatenated_pplacer_json" format="json" from_work_dir="output/storage/tree/concatenated.pplacer.json" label="${tool.name} on ${on_string}: Concatenated pplacer JSON">
+            <filter>'concatenate_pplacer_json' in extra_outputs</filter>
+        </data>
+        <collection name="genes_fna" type="list" label="${tool.name} on ${on_string}: Protein gene sequences for each bin">
+            <filter>not tree_analyze['genes'] and tree_analyze['nt'] and 'genes_fna' in extra_outputs</filter>
+            <discover_datasets pattern="(?P&lt;designation&gt;.*)/genes\.fna" format="fasta" directory="output/bins/" recurse="true" match_relative_path="true"/>
+        </collection>
+        <collection name="genes_faa" type="list" label="${tool.name} on ${on_string}: Nucleotide gene sequences for each bin">
+            <filter>'genes_faa' in extra_outputs</filter>
+            <discover_datasets pattern="(?P&lt;designation&gt;.*)/genes\.faa" format="fasta" directory="output/bins/" recurse="true" match_relative_path="true"/>
+        </collection>
+        <collection name="genes_gff" type="list" label="${tool.name} on ${on_string}: Gene feature files for each bin">
+            <filter>not tree_analyze['genes'] and 'genes_gff' in extra_outputs</filter>
+            <discover_datasets pattern="(?P&lt;designation&gt;.*)/genes\.gff" format="gff" directory="output/bins/" recurse="true" match_relative_path="true"/>
+        </collection>
+        <!--lineage_set outputs-->
+        <data name="marker_file" format="tabular" from_work_dir="output/lineage.ms" label="${tool.name} on ${on_string}: Marker genes">
+            <filter>'marker_file' in extra_outputs</filter>
+        </data>
+        <!--analyze outputs-->
+        <collection name="hmmer_analyze" type="list" label="${tool.name} on ${on_string}: Marker gene HMM hits to each bin">
+            <filter>'hmmer_analyze' in extra_outputs</filter>
+            <discover_datasets pattern="(?P&lt;designation&gt;.*)/hmmer\.analyze\.txt" format="txt" directory="output/bins/" recurse="true" match_relative_path="true"/>
+        </collection>
+        <data name="bin_stats_analyze" format="tabular" from_work_dir="output/storage/bin_stats.analyze.tsv" label="${tool.name} on ${on_string}: Marker gene bin stats">
+            <filter>'bin_stats_analyze' in extra_outputs</filter>
+        </data>
+        <data name="checkm_hmm_info" format="zip" from_work_dir="output/storage/checkm_hmm_info.pkl.gz" label="${tool.name} on ${on_string}: Marker gene HMM info for each bin" >
+            <filter>'checkm_hmm_info' in extra_outputs</filter>
+        </data>
+        <collection name="hmmer_analyze_ali" type="list" label="${tool.name} on ${on_string}: HMMER alignment file for each bin">
+            <filter>tree_analyze['ali'] and 'hmmer_analyze_ali' in extra_outputs</filter>
+            <discover_datasets pattern="(?P&lt;designation&gt;.*)/hmmer\.analyze\.ali\.txt" format="txt" directory="output/bins/" recurse="true" match_relative_path="true"/>
+        </collection>
+        <!--qa outputs-->
+        <data name="bin_stats_ext" format="tabular" from_work_dir="output/storage/bin_stats_ext.tsv" label="${tool.name} on ${on_string}: Marker gene bin extensive stats">
+            <filter>'bin_stats_ext' in extra_outputs</filter>
+        </data>
+        <expand macro="qa_extra_outputs" />
+
+    </outputs>
+    <tests>
+        <test expect_num_outputs="1">
+            <conditional name="bins">
+                <param name="select" value="individual"/>
+                <param name="bins_ind" ftype="fasta" value="637000110.fna"/>
+            </conditional>
+            <section name="tree_analyze">
+                <param name="reduced_tree" value="true"/>
+                <param name="ali" value="false"/>
+                <param name="nt" value="false"/>
+                <param name="genes" value="false"/>
+            </section>
+            <section name="lineage_set">
+                <param name="unique" value="10"/>
+                <param name="multi" value="10"/>
+                <param name="force_domain" value="false"/>
+                <param name="no_refinement" value="false"/>
+            </section>
+            <section name="qa">
+                <param name="individual_markers" value="false"/>
+                <param name="skip_adj_correction" value="false"/>
+                <param name="skip_pseudogene_correction" value="false"/>
+                <param name="aai_strain" value="0.9"/>
+                <param name="ignore_thresholds" value="false"/>
+                <param name="e_value" value="1e-10"/>
+                <param name="length" value="0.7"/>
+            </section>
+            <param name="extra_outputs" value=""/>
+            <output name="results" ftype="tabular">
+                <assert_contents>
+                    <has_text text="637000110"/>
+                    <has_text text="Marker lineage"/>
+                    <has_text text="k__Bacteria"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="12">
+            <conditional name="bins">
+                <param name="select" value="collection"/>
+                <param name="bins_coll">
+                    <collection type="list">
+                        <element name="637000110" ftype="fasta" value="637000110.fna"/>
+                    </collection>
+                </param>
+            </conditional>
+            <section name="tree_analyze">
+                <param name="reduced_tree" value="true"/>
+                <param name="ali" value="true"/>
+                <param name="nt" value="false"/>
+                <param name="genes" value="false"/>
+            </section>
+            <section name="lineage_set">
+                <param name="unique" value="10"/>
+                <param name="multi" value="10"/>
+                <param name="force_domain" value="false"/>
+                <param name="no_refinement" value="false"/>
+            </section>
+            <section name="qa">
+                <param name="individual_markers" value="false"/>
+                <param name="skip_adj_correction" value="false"/>
+                <param name="skip_pseudogene_correction" value="false"/>
+                <param name="aai_strain" value="0.9"/>
+                <param name="ignore_thresholds" value="false"/>
+                <param name="e_value" value="1e-10"/>
+                <param name="length" value="0.7"/>
+            </section>
+            <param name="extra_outputs" value="phylo_hmm_info,bin_stats_tree,hmmer_tree,concatenated_tre,concatenated_fasta,marker_file,hmmer_analyze,bin_stats_analyze,bin_stats_ext,checkm_hmm_info,marker_gene_stats"/>
+            <output name="results" ftype="tabular">
+                <assert_contents>
+                    <has_text text="637000110"/>
+                    <has_text text="Marker lineage"/>
+                    <has_text text="k__Bacteria"/>
+                </assert_contents>
+            </output>
+            <output name="phylo_hmm_info" ftype="zip">
+                <assert_contents>
+                    <has_size value="1575" delta="10"/>
+                </assert_contents>
+            </output>
+            <output name="bin_stats_tree" ftype="tabular">
+                <assert_contents>
+                    <has_text text="637000110"/>
+                    <has_text text="Mean scaffold length"/>
+                    <has_text text="Translation table"/>
+                </assert_contents>
+            </output>
+            <output_collection name="hmmer_tree" count="1">
+                <element name="637000110" ftype="txt">
+                    <assert_contents>
+                        <has_text text="target name"/>
+                        <has_text text="AC_000091_79"/>
+                    </assert_contents>
+                </element>
+            </output_collection>
+            <output name="concatenated_fasta" ftype="fasta">
+                <assert_contents>
+                    <has_text text="637000110"/>
+                    <has_text text="MLKAGVHFGHQ"/>
+                </assert_contents>
+            </output>
+            <output name="concatenated_tre" ftype="phyloxml">
+                <assert_contents>
+                    <has_text text="IMG_646564547"/>
+                    <has_text text="g__Methanocaldococcus"/>
+                </assert_contents>
+            </output>
+            <output name="marker_file" ftype="tabular">
+                <assert_contents>
+                    <has_text text="Lineage Marker File"/>
+                    <has_text text="637000110"/>
+                    <has_text text="k__Bacteria"/>
+                </assert_contents>
+            </output>
+            <output_collection name="hmmer_analyze" count="1">
+                <element name="637000110" ftype="txt">
+                    <assert_contents>
+                        <has_text text="target name"/>
+                        <has_text text="AC_000091_859"/>
+                    </assert_contents>
+                </element>
+            </output_collection>
+            <output name="bin_stats_analyze" ftype="tabular">
+                <assert_contents>
+                    <has_text text="637000110"/>
+                    <has_text text="GC"/>
+                    <has_text text="GC std"/>
+                </assert_contents>
+            </output>
+            <output name="bin_stats_ext" ftype="tabular">
+                <assert_contents>
+                    <has_text text="637000110"/>
+                    <has_text text="marker lineage"/>
+                </assert_contents>
+            </output>
+            <output name="checkm_hmm_info" ftype="zip">
+                <assert_contents>
+                    <has_size value="17052" delta="200"/>
+                </assert_contents>
+            </output>
+            <output name="marker_gene_stats" ftype="tabular">
+                <assert_contents>
+                    <has_text text="637000110"/>
+                    <has_text text="AC_000091_79"/>
+                    <has_text text="PF00318.15"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+@HELP_HEADER@
+
+This command runs the recommended workflow for assessing the completeness and contamination of genome bins is to use lineage-specific marker sets. 
+This workflow consists of 4 mandatory (M) steps and 1 recommended (R) step:
+
+- (M) The tree command places genome bins into a reference genome tree
+- (R) The tree_qa command indicates the number of phylogenetically informative marker genes found in each genome bin along with a taxonomic string indicating its approximate placement in the tree. 
+
+    If desired, genome bins with few phylogenetically marker genes may be removed in order to reduce the computational requirements of the following commands. 
+    Alternatively, if only genomes from a particular taxonomic group are of interest these can be moved to a new directory and analyzed separately. 
+
+- (M) The lineage_set command creates a marker file indicating lineage-specific marker sets suitable for evaluating each genome. 
+- (M) The analyze command identifies marker genes and estimates the completeness and contamination of each genome bin. 
+- (M) The qa command can be used to produce different tables summarizing the quality of each genome bin.
+
+    ]]></help>
+    <expand macro="citations"/>
+</tool>