diff qa.xml @ 0:5c0493cdced9 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/checkm commit 2a3b068a98bf0e913dc03e0d5c2182cfd102cf27
author iuc
date Fri, 29 Jul 2022 20:34:59 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/qa.xml	Fri Jul 29 20:34:59 2022 +0000
@@ -0,0 +1,500 @@
+<tool id="checkm_qa" name="CheckM qa" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <description>
+        Assess bins for contamination and completeness
+    </description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="biotools"/>
+    <expand macro="requirements"/>
+    <expand macro="version"/>
+    <command detect_errors="exit_code"><![CDATA[
+#import re
+mkdir -p 'output/storage/' && ## stage every input under the 'output' folder handed to checkm qa below
+cp '$checkm_hmm_info' 'output/storage/checkm_hmm_info.pkl.gz' &&
+cp '$bin_stats_analyze' 'output/storage/bin_stats.analyze.tsv' &&
+#for $hits in $hmmer_analyze
+    #set $bin_dir = re.sub('[^\s\w\-\\.]', '_', str($hits.element_identifier))
+mkdir -p 'output/bins/${bin_dir}' && ## one folder per collection element, named after its sanitized identifier
+cp '$hits' 'output/bins/${bin_dir}/hmmer.analyze.txt' &&
+#end for
+#if $output.out_format == '9'
+    #for $faa in $output.genes_faa
+        #set $bin_dir = re.sub('[^\s\w\-\\.]', '_', str($faa.element_identifier))
+mkdir -p 'output/bins/${bin_dir}' && ## same naming rule as above, so files land next to the matching HMM hits
+cp '$faa' 'output/bins/${bin_dir}/genes.faa' &&
+    #end for
+#end if
+
+checkm qa
+    '$marker_file'
+    'output'
+    --out_format $output.out_format
+    --tab_table
+    --file 'output_file'
+#if $exclude_markers
+    --exclude_markers '$exclude_markers'
+#end if
+    $individual_markers
+    $skip_adj_correction
+    $skip_pseudogene_correction
+    --aai_strain $aai_strain
+    $ignore_thresholds
+    --e_value $e_value
+    --length $length
+#if $coverage
+    --coverage_file '$coverage'
+#end if
+    --threads \${GALAXY_SLOTS:-1}
+    ]]></command>
+    <inputs>
+        <expand macro="marker_file" />
+        <param name="checkm_hmm_info" type="data" format="zip" label="Marker gene HMM info for each bin" help="Output of the CheckM analyze tool" />
+        <param name="bin_stats_analyze" type="data" format="tabular" label="Marker gene bin stats" help="Output of the CheckM analyze tool" />
+        <param name="hmmer_analyze" type="data_collection" collection_type="list" format="txt" label="Marker gene HMM hits to each bin" help="Output of the CheckM analyze tool" />
+        <conditional name="output"> <!-- only format 9 asks for an extra input: the per-bin gene sequences -->
+            <param argument="--out_format" type="select" label="Desired output">
+                <option value="1">Summary of bin completeness and contamination</option>
+                <option value="2">Extended summary of bin statistics (includes GC, genome size, ...)</option>
+                <option value="3">Summary of bin quality for increasingly basal lineage-specific marker sets</option>
+                <option value="4">List of marker genes and their counts</option>
+                <option value="5">List of bin id, marker gene id, gene id</option>
+                <option value="6">List of marker genes present multiple times in a bin</option>
+                <option value="7">List of marker genes present multiple times on the same scaffold</option>
+                <option value="8">List indicating position of each marker gene within a bin</option>
+                <option value="9">Marker genes identified in each bin and their sequence</option>
+            </param>
+            <when value="1"/>
+            <when value="2"/>
+            <when value="3"/>
+            <when value="4"/>
+            <when value="5"/>
+            <when value="6"/>
+            <when value="7"/>
+            <when value="8"/>
+            <when value="9">
+                <param name="genes_faa" type="data_collection" collection_type="list" format="fasta" label="Protein gene sequences for each bin" help="One amino acid FASTA per bin; element identifiers should match those of the HMM hits collection"/> <!-- staged as genes.faa by the command block -->
+            </when>
+        </conditional>
+        <param argument="--exclude_markers" type="data" format="txt" optional="true" label="Markers to exclude from marker sets" />
+        <expand macro="qa_params" />
+        <param name="coverage" argument="--coverage_file" type="data" format="txt" optional="true" label="Coverage of each sequence" help="Generated by the coverage command" /> <!-- name kept as "coverage" because the command template reads $coverage -->
+        <param name="extra_outputs" type="select" multiple="true" optional="true" label="Extra outputs">
+            <expand macro="qa_extra_output_options" />
+        </param>
+    </inputs>
+    <outputs> <!-- output_f1 to output_f9 all collect the same work-dir file 'output_file'; each filter matches a single out_format value, so only the dataset for the selected format is created -->
+        <data name="output_f1" format="tabular" from_work_dir="output_file" label="${tool.name} on ${on_string}: Summary of bin completeness and contamination">
+            <filter>output['out_format']=="1"</filter>
+        </data>
+        <data name="output_f2" format="tabular" from_work_dir="output_file" label="${tool.name} on ${on_string}: Extended summary of bin statistics">
+            <filter>output['out_format']=="2"</filter>
+        </data>
+        <data name="output_f3" format="tabular" from_work_dir="output_file" label="${tool.name} on ${on_string}: Summary of bin quality for increasingly basal lineage-specific marker sets">
+            <filter>output['out_format']=="3"</filter>
+        </data>
+        <data name="output_f4" format="tabular" from_work_dir="output_file" label="${tool.name} on ${on_string}: Marker genes and their counts">
+            <filter>output['out_format']=="4"</filter>
+        </data>
+        <data name="output_f5" format="tabular" from_work_dir="output_file" label="${tool.name} on ${on_string}: Bin id, marker gene id, gene id">
+            <filter>output['out_format']=="5"</filter>
+        </data>
+        <data name="output_f6" format="tabular" from_work_dir="output_file" label="${tool.name} on ${on_string}: Marker genes present multiple times in a bin">
+            <filter>output['out_format']=="6"</filter>
+        </data>
+        <data name="output_f7" format="tabular" from_work_dir="output_file" label="${tool.name} on ${on_string}: Marker genes present multiple times on the same scaffold">
+            <filter>output['out_format']=="7"</filter>
+        </data>
+        <data name="output_f8" format="tabular" from_work_dir="output_file" label="${tool.name} on ${on_string}: Indicating position of each marker gene within a bin">
+            <filter>output['out_format']=="8"</filter>
+        </data>
+        <data name="output_f9" format="tabular" from_work_dir="output_file" label="${tool.name} on ${on_string}: Marker genes identified in each bin and their sequence">
+            <filter>output['out_format']=="9"</filter>
+        </data>
+        <data name="bin_stats_ext" format="tabular" from_work_dir="output/storage/bin_stats_ext.tsv" label="${tool.name} on ${on_string}: Marker gene bin extensive stats"/> <!-- carries no filter, so it is declared for every out_format -->
+        <expand macro="qa_extra_outputs" />
+    </outputs>
+    <tests>
+        <test expect_num_outputs="3"> <!-- out_format 1 plus the marker_gene_stats extra output, hence three expected outputs -->
+            <param name="marker_file" ftype="tabular" value="lineage_marker_set"/>
+            <param name="hmmer_analyze">
+                <collection type="list">
+                   <element name="637000110" ftype="txt" value="hmmer.analyze.txt"/>
+                </collection>
+            </param>
+            <param name="bin_stats_analyze" ftype="tabular" value="bin_stats.analyze.tsv"/>
+            <param name="checkm_hmm_info" ftype="zip" value="checkm_hmm_info.pkl.gz"/>
+            <conditional name="output">
+                <param name="out_format" value="1"/>
+            </conditional>
+            <param name="individual_markers" value="false"/>
+            <param name="skip_adj_correction" value="false"/>
+            <param name="skip_pseudogene_correction" value="false"/>
+            <param name="aai_strain" value="0.9"/>
+            <param name="ignore_thresholds" value="false"/>
+            <param name="e_value" value="1e-10"/>
+            <param name="length" value="0.7"/>
+            <param name="extra_outputs" value="marker_gene_stats"/>
+            <output name="output_f1" ftype="tabular">
+                <assert_contents>
+                    <has_text text="Marker lineage"/>
+                    <has_text text="637000110"/>
+                    <has_text text="f__Enterobacteriaceae"/>
+                </assert_contents>
+            </output>
+            <output name="bin_stats_ext" ftype="tabular">
+                <assert_contents>
+                    <has_text text="637000110"/>
+                    <has_text text="marker lineage"/>
+                    <has_text text="GCN0"/>
+                    <has_text text="Longest contig"/>
+                </assert_contents>
+            </output>
+            <!--<output name="alignment_file" ftype="tabular">
+                <assert_contents>
+                    <has_text text="637000110"/>
+                    <has_text text="Lineage Marker File"/>
+                    <has_text text="UID5139"/>
+                </assert_contents>
+            </output>-->
+            <output name="marker_gene_stats" ftype="tabular">
+                <assert_contents>
+                    <has_text text="637000110"/>
+                    <has_text text="AC_000091_751"/>
+                    <has_text text="TIGR02432"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="2"> <!-- out_format 2 (extended summary), no extra outputs selected -->
+            <param name="marker_file" ftype="tabular" value="lineage_marker_set"/>
+            <param name="hmmer_analyze">
+                <collection type="list">
+                   <element name="637000110" ftype="txt" value="hmmer.analyze.txt"/>
+                </collection>
+            </param>
+            <param name="bin_stats_analyze" ftype="tabular" value="bin_stats.analyze.tsv"/>
+            <param name="checkm_hmm_info" ftype="zip" value="checkm_hmm_info.pkl.gz"/>
+            <conditional name="output">
+                <param name="out_format" value="2"/>
+            </conditional>
+            <param name="individual_markers" value="false"/>
+            <param name="skip_adj_correction" value="false"/>
+            <param name="skip_pseudogene_correction" value="false"/>
+            <param name="aai_strain" value="0.9"/>
+            <param name="ignore_thresholds" value="false"/>
+            <param name="e_value" value="1e-10"/>
+            <param name="length" value="0.7"/>
+            <param name="extra_outputs" value=""/>
+            <output name="output_f2" ftype="tabular">
+                <assert_contents>
+                    <has_text text="Marker lineage"/>
+                    <has_text text="Mean scaffold length"/>
+                    <has_text text="f__Enterobacteriaceae"/>
+                </assert_contents>
+            </output>
+            <output name="bin_stats_ext" ftype="tabular">
+                <assert_contents>
+                    <has_text text="637000110"/>
+                    <has_text text="marker lineage"/>
+                    <has_text text="GCN0"/>
+                    <has_text text="Longest contig"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="2"> <!-- out_format 3 (basal lineage-specific marker sets), no extra outputs selected -->
+            <param name="marker_file" ftype="tabular" value="lineage_marker_set"/>
+            <param name="hmmer_analyze">
+                <collection type="list">
+                   <element name="637000110" ftype="txt" value="hmmer.analyze.txt"/>
+                </collection>
+            </param>
+            <param name="bin_stats_analyze" ftype="tabular" value="bin_stats.analyze.tsv"/>
+            <param name="checkm_hmm_info" ftype="zip" value="checkm_hmm_info.pkl.gz"/>
+            <conditional name="output">
+                <param name="out_format" value="3"/>
+            </conditional>
+            <param name="individual_markers" value="false"/>
+            <param name="skip_adj_correction" value="false"/>
+            <param name="skip_pseudogene_correction" value="false"/>
+            <param name="aai_strain" value="0.9"/>
+            <param name="ignore_thresholds" value="false"/>
+            <param name="e_value" value="1e-10"/>
+            <param name="length" value="0.7"/>
+            <param name="extra_outputs" value=""/>
+            <output name="output_f3" ftype="tabular">
+                <assert_contents>
+                    <has_text text="637000110"/>
+                    <has_text text="Strain heterogeneity"/>
+                    <has_text text="UID5139"/>
+                    <has_text text="p__Proteobacteria"/>
+                </assert_contents>
+            </output>
+            <output name="bin_stats_ext" ftype="tabular">
+                <assert_contents>
+                    <has_text text="637000110"/>
+                    <has_text text="marker lineage"/>
+                    <has_text text="GCN0"/>
+                    <has_text text="Longest contig"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="2"> <!-- out_format 4 (marker genes and their counts), no extra outputs selected -->
+            <param name="marker_file" ftype="tabular" value="lineage_marker_set"/>
+            <param name="hmmer_analyze">
+                <collection type="list">
+                   <element name="637000110" ftype="txt" value="hmmer.analyze.txt"/>
+                </collection>
+            </param>
+            <param name="bin_stats_analyze" ftype="tabular" value="bin_stats.analyze.tsv"/>
+            <param name="checkm_hmm_info" ftype="zip" value="checkm_hmm_info.pkl.gz"/>
+            <conditional name="output">
+                <param name="out_format" value="4"/>
+            </conditional>
+            <param name="individual_markers" value="false"/>
+            <param name="skip_adj_correction" value="false"/>
+            <param name="skip_pseudogene_correction" value="false"/>
+            <param name="aai_strain" value="0.9"/>
+            <param name="ignore_thresholds" value="false"/>
+            <param name="e_value" value="1e-10"/>
+            <param name="length" value="0.7"/>
+            <param name="extra_outputs" value=""/>
+            <output name="output_f4" ftype="tabular">
+                <assert_contents>
+                    <has_text text="637000110"/>
+                    <has_text text="Node Id: UID5103; Marker lineage: f__Enterobacteriaceae"/>
+                    <has_text text="PF02542.1"/>
+                </assert_contents>
+            </output>
+            <output name="bin_stats_ext" ftype="tabular">
+                <assert_contents>
+                    <has_text text="637000110"/>
+                    <has_text text="marker lineage"/>
+                    <has_text text="GCN0"/>
+                    <has_text text="Longest contig"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="2"> <!-- out_format 5 (bin id, marker gene id, gene id), no extra outputs selected -->
+            <param name="marker_file" ftype="tabular" value="lineage_marker_set"/>
+            <param name="hmmer_analyze">
+                <collection type="list">
+                   <element name="637000110" ftype="txt" value="hmmer.analyze.txt"/>
+                </collection>
+            </param>
+            <param name="bin_stats_analyze" ftype="tabular" value="bin_stats.analyze.tsv"/>
+            <param name="checkm_hmm_info" ftype="zip" value="checkm_hmm_info.pkl.gz"/>
+            <conditional name="output">
+                <param name="out_format" value="5"/>
+            </conditional>
+            <param name="individual_markers" value="false"/>
+            <param name="skip_adj_correction" value="false"/>
+            <param name="skip_pseudogene_correction" value="false"/>
+            <param name="aai_strain" value="0.9"/>
+            <param name="ignore_thresholds" value="false"/>
+            <param name="e_value" value="1e-10"/>
+            <param name="length" value="0.7"/>
+            <param name="extra_outputs" value=""/>
+            <output name="output_f5" ftype="tabular">
+                <assert_contents>
+                    <has_text text="637000110"/>
+                    <has_text text="TIGR02432"/>
+                    <has_text text="AC_000091_165"/>
+                </assert_contents>
+            </output>
+            <output name="bin_stats_ext" ftype="tabular">
+                <assert_contents>
+                    <has_text text="637000110"/>
+                    <has_text text="marker lineage"/>
+                    <has_text text="GCN0"/>
+                    <has_text text="Longest contig"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="2"> <!-- out_format 6 (markers present multiple times in a bin); asserts the 'No marker genes satisfied' message -->
+            <param name="marker_file" ftype="tabular" value="lineage_marker_set"/>
+            <param name="hmmer_analyze">
+                <collection type="list">
+                   <element name="637000110" ftype="txt" value="hmmer.analyze.txt"/>
+                </collection>
+            </param>
+            <param name="bin_stats_analyze" ftype="tabular" value="bin_stats.analyze.tsv"/>
+            <param name="checkm_hmm_info" ftype="zip" value="checkm_hmm_info.pkl.gz"/>
+            <conditional name="output">
+                <param name="out_format" value="6"/>
+            </conditional>
+            <param name="individual_markers" value="false"/>
+            <param name="skip_adj_correction" value="false"/>
+            <param name="skip_pseudogene_correction" value="false"/>
+            <param name="aai_strain" value="0.9"/>
+            <param name="ignore_thresholds" value="false"/>
+            <param name="e_value" value="1e-10"/>
+            <param name="length" value="0.7"/>
+            <param name="extra_outputs" value=""/>
+            <output name="output_f6" ftype="tabular">
+                <assert_contents>
+                    <has_text text="Marker Id"/>
+                    <has_text text="No marker genes satisfied"/>
+                </assert_contents>
+            </output>
+            <output name="bin_stats_ext" ftype="tabular">
+                <assert_contents>
+                    <has_text text="637000110"/>
+                    <has_text text="marker lineage"/>
+                    <has_text text="GCN0"/>
+                    <has_text text="Longest contig"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="2"> <!-- out_format 7 (markers present multiple times on the same scaffold); asserts the 'No marker genes satisfied' message -->
+            <param name="marker_file" ftype="tabular" value="lineage_marker_set"/>
+            <param name="hmmer_analyze">
+                <collection type="list">
+                   <element name="637000110" ftype="txt" value="hmmer.analyze.txt"/>
+                </collection>
+            </param>
+            <param name="bin_stats_analyze" ftype="tabular" value="bin_stats.analyze.tsv"/>
+            <param name="checkm_hmm_info" ftype="zip" value="checkm_hmm_info.pkl.gz"/>
+            <conditional name="output">
+                <param name="out_format" value="7"/>
+            </conditional>
+            <param name="individual_markers" value="false"/>
+            <param name="skip_adj_correction" value="false"/>
+            <param name="skip_pseudogene_correction" value="false"/>
+            <param name="aai_strain" value="0.9"/>
+            <param name="ignore_thresholds" value="false"/>
+            <param name="e_value" value="1e-10"/>
+            <param name="length" value="0.7"/>
+            <param name="extra_outputs" value=""/>
+            <output name="output_f7" ftype="tabular">
+                <assert_contents>
+                    <has_text text="Marker Id"/>
+                    <has_text text="No marker genes satisfied"/>
+                </assert_contents>
+            </output>
+            <output name="bin_stats_ext" ftype="tabular">
+                <assert_contents>
+                    <has_text text="637000110"/>
+                    <has_text text="marker lineage"/>
+                    <has_text text="GCN0"/>
+                    <has_text text="Longest contig"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="2"> <!-- out_format 8 (position of each marker gene within a bin), no extra outputs selected -->
+            <param name="marker_file" ftype="tabular" value="lineage_marker_set"/>
+            <param name="hmmer_analyze">
+                <collection type="list">
+                   <element name="637000110" ftype="txt" value="hmmer.analyze.txt"/>
+                </collection>
+            </param>
+            <param name="bin_stats_analyze" ftype="tabular" value="bin_stats.analyze.tsv"/>
+            <param name="checkm_hmm_info" ftype="zip" value="checkm_hmm_info.pkl.gz"/>
+            <conditional name="output">
+                <param name="out_format" value="8"/>
+            </conditional>
+            <param name="individual_markers" value="false"/>
+            <param name="skip_adj_correction" value="false"/>
+            <param name="skip_pseudogene_correction" value="false"/>
+            <param name="aai_strain" value="0.9"/>
+            <param name="ignore_thresholds" value="false"/>
+            <param name="e_value" value="1e-10"/>
+            <param name="length" value="0.7"/>
+            <param name="extra_outputs" value=""/>
+            <output name="output_f8" ftype="tabular">
+                <assert_contents>
+                    <has_text text="637000110"/>
+                    <has_text text="AC_000091_183"/>
+                    <has_text text="TIGR02075,9,240"/>
+                </assert_contents>
+            </output>
+            <output name="bin_stats_ext" ftype="tabular">
+                <assert_contents>
+                    <has_text text="637000110"/>
+                    <has_text text="marker lineage"/>
+                    <has_text text="GCN0"/>
+                    <has_text text="Longest contig"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="2"> <!-- out_format 9: the only case that supplies the genes_faa collection and an exclude_markers file -->
+            <param name="marker_file" ftype="tabular" value="lineage_marker_set"/>
+            <param name="hmmer_analyze">
+                <collection type="list">
+                   <element name="637000110" ftype="txt" value="hmmer.analyze.txt"/>
+                </collection>
+            </param>
+            <param name="bin_stats_analyze" ftype="tabular" value="bin_stats.analyze.tsv"/>
+            <param name="checkm_hmm_info" ftype="zip" value="checkm_hmm_info.pkl.gz"/>
+            <conditional name="output">
+                <param name="out_format" value="9"/>
+                <param name="genes_faa">
+                    <collection type="list">
+                    <element name="637000110" ftype="fasta" value="637000110.faa"/>
+                    </collection>
+                </param>
+            </conditional>
+            <param name="exclude_markers" ftype="txt" value="markers_to_exclude" />
+            <param name="individual_markers" value="false"/>
+            <param name="skip_adj_correction" value="false"/>
+            <param name="skip_pseudogene_correction" value="false"/>
+            <param name="aai_strain" value="0.9"/>
+            <param name="ignore_thresholds" value="false"/>
+            <param name="e_value" value="1e-10"/>
+            <param name="length" value="0.7"/>
+            <param name="extra_outputs" value=""/>
+            <output name="output_f9" ftype="tabular">
+                <assert_contents>
+                    <has_text text="637000110"/>
+                    <has_text text="Sequence"/>
+                    <has_text text="PF06574.7"/>
+                    <has_text text="MKLIRGI"/>
+                </assert_contents>
+            </output>
+            <output name="bin_stats_ext" ftype="tabular">
+                <assert_contents>
+                    <has_text text="637000110"/>
+                    <has_text text="marker lineage"/>
+                    <has_text text="GCN0"/>
+                    <has_text text="Longest contig"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+@HELP_HEADER@
+
+This command identifies marker genes in bins and calculates genome statistics.
+
+Adjacent called genes matching the same marker gene may indicate a true duplication event, a gene calling error, or an assembly error. If adjacent genes hit distinct regions of the same marker gene HMM, CheckM assumes a gene calling error has occurred and concatenates the two genes. When this occurs, CheckM concatenates the gene ids of the two genes with a pair of ampersands (&&).
+
+Outputs
+=======
+The output depends on the selected output format
+
+1. Summary of bin completeness, contamination, and strain heterogeneity
+    Bin Id: bin identifier derived from input FASTA file
+    Marker lineage: indicates lineage used for inferring marker set (a precise indication of where a bin was placed in CheckM's reference tree can be obtained with the tree_qa command)
+    No. genomes: number of reference genomes used to infer marker set
+    No. markers: number of inferred marker genes
+    No. marker sets: number of inferred co-located marker sets
+    0-5+: number of times each marker gene is identified
+    Completeness: estimated completeness
+    Contamination: estimated contamination
+    Strain heterogeneity: estimated strain heterogeneity
+2. Extended summary of bin quality (includes GC, genome size, coding density, ...)
+3. Summary of bin quality for increasingly basal lineage-specific marker sets
+    Node Id: unique id of internal node in genome tree from which lineage-specific markers were inferred
+4. List of marker genes for each bin along with the number of times each marker was identified
+    Node Id: unique id of internal node in genome tree from which lineage-specific markers were inferred
+    Marker lineage: indicates lineage used for inferring marker set
+    Useful for identifying lineage-specific gene loss or duplication
+5. List of bin id, marker gene id, and called gene id for each identified marker gene
+6. List of marker genes present multiple times in a bin
+7. List of marker genes present multiple times on the same scaffold
+    Useful for identifying true gene duplication events, gene calling errors, or assembly errors. See the note on adjacent genes above.
+8. List indicating the position of each marker gene within a bin
+9. Marker genes identified in each bin and their sequence
+
+    ]]></help>
+    <expand macro="citations"/>
+</tool>