Mercurial > repos > iuc > checkm_qa
diff qa.xml @ 0:5c0493cdced9 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/checkm commit 2a3b068a98bf0e913dc03e0d5c2182cfd102cf27
author | iuc |
---|---|
date | Fri, 29 Jul 2022 20:34:59 +0000 |
parents | |
children |
line wrap: on
line diff
<tool id="checkm_qa" name="CheckM qa" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <!--
        Galaxy wrapper for the "checkm qa" command.
        The version tokens, requirements, shared parameters (marker_file, qa_params),
        extra outputs and citations are expanded from macros.xml.
    -->
    <description>Assess bins for contamination and completeness</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="biotools"/>
    <expand macro="requirements"/>
    <expand macro="version"/>
    <command detect_errors="exit_code"><![CDATA[
## Stage the results of a previous CheckM analyze run under 'output/',
## the folder that is passed to 'checkm qa' below.
#import re
mkdir -p 'output/storage/' &&
cp '$checkm_hmm_info' 'output/storage/checkm_hmm_info.pkl.gz' &&
cp '$bin_stats_analyze' 'output/storage/bin_stats.analyze.tsv' &&
## One folder per bin, named after the sanitised element identifier: every character
## other than whitespace, word characters, '-' and '.' is replaced by '_'.
#for $i in $hmmer_analyze
    #set $identifier = re.sub('[^\s\w\-\\.]', '_', str($i.element_identifier))
mkdir -p 'output/bins/${identifier}' &&
cp '$i' 'output/bins/${identifier}/hmmer.analyze.txt' &&
#end for
## Output format 9 additionally needs the gene sequences of each bin; they go into the
## same per-bin folder, so their element identifiers must match the HMM hits collection.
#if $output.out_format == '9'
    #for $i in $output.genes_faa
        #set $identifier = re.sub('[^\s\w\-\\.]', '_', str($i.element_identifier))
mkdir -p 'output/bins/${identifier}' &&
cp '$i' 'output/bins/${identifier}/genes.faa' &&
    #end for
#end if

checkm qa
    '$marker_file'
    'output'
    --out_format $output.out_format
    --tab_table
    --file 'output_file'
#if $exclude_markers
    --exclude_markers '$exclude_markers'
#end if
    $individual_markers
    $skip_adj_correction
    $skip_pseudogene_correction
    --aai_strain $aai_strain
    $ignore_thresholds
    --e_value $e_value
    --length $length
#if $coverage
    --coverage_file '$coverage'
#end if
    --threads \${GALAXY_SLOTS:-1}
    ]]></command>
    <inputs>
        <expand macro="marker_file" />
        <param name="checkm_hmm_info" type="data" format="zip" label="Marker gene HMM info for each bin" help="Output of the CheckM analyze tool" />
        <param name="bin_stats_analyze" type="data" format="tabular" label="Marker gene bin stats" help="Output of the CheckM analyze tool" />
        <param name="hmmer_analyze" type="data_collection" collection_type="list" format="txt" label="Marker gene HMM hits to each bin" help="Output of the CheckM analyze tool" />
        <!-- Only output format 9 needs an additional input (the gene sequences of each bin) -->
        <conditional name="output">
            <param argument="--out_format" type="select" label="Desired output">
                <option value="1">Summary of bin completeness and contamination</option>
                <option value="2">Extended summary of bin statistics (includes GC, genome size, ...)</option>
                <option value="3">Summary of bin quality for increasingly basal lineage-specific marker sets</option>
                <option value="4">List of marker genes and their counts</option>
                <option value="5">List of bin id, marker gene id, gene id</option>
                <option value="6">List of marker genes present multiple times in a bin</option>
                <option value="7">List of marker genes present multiple times on the same scaffold</option>
                <option value="8">List indicating position of each marker gene within a bin</option>
                <option value="9">Marker genes identified in each bin and their sequence</option>
            </param>
            <when value="1"/>
            <when value="2"/>
            <when value="3"/>
            <when value="4"/>
            <when value="5"/>
            <when value="6"/>
            <when value="7"/>
            <when value="8"/>
            <when value="9">
                <!-- Staged as genes.faa, i.e. protein sequences (the test fixture is an amino acid FASTA) -->
                <param name="genes_faa" type="data_collection" collection_type="list" label="Amino acid gene sequences for each bin" help="Element identifiers must match those of the marker gene HMM hits collection so that both files are placed in the same bin folder" />
            </when>
        </conditional>
        <param argument="--exclude_markers" type="data" format="txt" optional="true" label="Markers to exclude from marker sets" />
        <expand macro="qa_params" />
        <!-- The name is set explicitly so the parameter stays "coverage"; the argument shows the flag used in the command -->
        <param name="coverage" argument="--coverage_file" type="data" format="txt" optional="true" label="Coverage of each sequence" help="Generated by the coverage command" />
        <param name="extra_outputs" type="select" multiple="true" optional="true" label="Extra outputs">
            <expand macro="qa_extra_output_options" />
        </param>
    </inputs>
    <outputs>
        <!--
            All nine format outputs collect the same work dir file 'output_file'.
            Each filter matches a single out_format value, so exactly one of them
            is created per job, labelled after the selected format.
        -->
        <data name="output_f1" format="tabular" from_work_dir="output_file" label="${tool.name} on ${on_string}: Summary of bin completeness and contamination">
            <filter>output['out_format']=="1"</filter>
        </data>
        <data name="output_f2" format="tabular" from_work_dir="output_file" label="${tool.name} on ${on_string}: Extended summary of bin statistics">
            <filter>output['out_format']=="2"</filter>
        </data>
        <data name="output_f3" format="tabular" from_work_dir="output_file" label="${tool.name} on ${on_string}: Summary of bin quality for increasingly basal lineage-specific marker sets">
            <filter>output['out_format']=="3"</filter>
        </data>
        <data name="output_f4" format="tabular" from_work_dir="output_file" label="${tool.name} on ${on_string}: Marker genes and their counts">
            <filter>output['out_format']=="4"</filter>
        </data>
        <data name="output_f5" format="tabular" from_work_dir="output_file" label="${tool.name} on ${on_string}: Bin id, marker gene id, gene id">
            <filter>output['out_format']=="5"</filter>
        </data>
        <data name="output_f6" format="tabular" from_work_dir="output_file" label="${tool.name} on ${on_string}: Marker genes present multiple times in a bin">
            <filter>output['out_format']=="6"</filter>
        </data>
        <data name="output_f7" format="tabular" from_work_dir="output_file" label="${tool.name} on ${on_string}: Marker genes present multiple times on the same scaffold">
            <filter>output['out_format']=="7"</filter>
        </data>
        <data name="output_f8" format="tabular" from_work_dir="output_file" label="${tool.name} on ${on_string}: Indicating position of each marker gene within a bin">
            <filter>output['out_format']=="8"</filter>
        </data>
        <data name="output_f9" format="tabular" from_work_dir="output_file" label="${tool.name} on ${on_string}: Marker genes identified in each bin and their sequence">
            <filter>output['out_format']=="9"</filter>
        </data>
        <!-- No filter: this output is created for every output format -->
        <data name="bin_stats_ext" format="tabular" from_work_dir="output/storage/bin_stats_ext.tsv" label="${tool.name} on ${on_string}: Marker gene bin extensive stats"/>
        <expand macro="qa_extra_outputs" />
    </outputs>
    <tests>
        <!-- One test per output format (1 to 9), all run on the same single-bin fixture 637000110 -->
        <!-- Format 1, plus the extra output marker_gene_stats -->
        <test expect_num_outputs="3">
            <param name="marker_file" ftype="tabular" value="lineage_marker_set"/>
            <param name="hmmer_analyze">
                <collection type="list">
                    <element name="637000110" ftype="txt" value="hmmer.analyze.txt"/>
                </collection>
            </param>
            <param name="bin_stats_analyze" ftype="tabular" value="bin_stats.analyze.tsv"/>
            <param name="checkm_hmm_info" ftype="zip" value="checkm_hmm_info.pkl.gz"/>
            <conditional name="output">
                <param name="out_format" value="1"/>
            </conditional>
            <param name="individual_markers" value="false"/>
            <param name="skip_adj_correction" value="false"/>
            <param name="skip_pseudogene_correction" value="false"/>
            <param name="aai_strain" value="0.9"/>
            <param name="ignore_thresholds" value="false"/>
            <param name="e_value" value="1e-10"/>
            <param name="length" value="0.7"/>
            <param name="extra_outputs" value="marker_gene_stats"/>
            <output name="output_f1" ftype="tabular">
                <assert_contents>
                    <has_text text="Marker lineage"/>
                    <has_text text="637000110"/>
                    <has_text text="f__Enterobacteriaceae"/>
                </assert_contents>
            </output>
            <output name="bin_stats_ext" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="marker lineage"/>
                    <has_text text="GCN0"/>
                    <has_text text="Longest contig"/>
                </assert_contents>
            </output>
            <!--<output name="alignment_file" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="Lineage Marker File"/>
                    <has_text text="UID5139"/>
                </assert_contents>
            </output>-->
            <output name="marker_gene_stats" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="AC_000091_751"/>
                    <has_text text="TIGR02432"/>
                </assert_contents>
            </output>
        </test>
        <!-- Format 2 -->
        <test expect_num_outputs="2">
            <param name="marker_file" ftype="tabular" value="lineage_marker_set"/>
            <param name="hmmer_analyze">
                <collection type="list">
                    <element name="637000110" ftype="txt" value="hmmer.analyze.txt"/>
                </collection>
            </param>
            <param name="bin_stats_analyze" ftype="tabular" value="bin_stats.analyze.tsv"/>
            <param name="checkm_hmm_info" ftype="zip" value="checkm_hmm_info.pkl.gz"/>
            <conditional name="output">
                <param name="out_format" value="2"/>
            </conditional>
            <param name="individual_markers" value="false"/>
            <param name="skip_adj_correction" value="false"/>
            <param name="skip_pseudogene_correction" value="false"/>
            <param name="aai_strain" value="0.9"/>
            <param name="ignore_thresholds" value="false"/>
            <param name="e_value" value="1e-10"/>
            <param name="length" value="0.7"/>
            <param name="extra_outputs" value=""/>
            <output name="output_f2" ftype="tabular">
                <assert_contents>
                    <has_text text="Marker lineage"/>
                    <has_text text="Mean scaffold length"/>
                    <has_text text="f__Enterobacteriaceae"/>
                </assert_contents>
            </output>
            <output name="bin_stats_ext" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="marker lineage"/>
                    <has_text text="GCN0"/>
                    <has_text text="Longest contig"/>
                </assert_contents>
            </output>
        </test>
        <!-- Format 3 -->
        <test expect_num_outputs="2">
            <param name="marker_file" ftype="tabular" value="lineage_marker_set"/>
            <param name="hmmer_analyze">
                <collection type="list">
                    <element name="637000110" ftype="txt" value="hmmer.analyze.txt"/>
                </collection>
            </param>
            <param name="bin_stats_analyze" ftype="tabular" value="bin_stats.analyze.tsv"/>
            <param name="checkm_hmm_info" ftype="zip" value="checkm_hmm_info.pkl.gz"/>
            <conditional name="output">
                <param name="out_format" value="3"/>
            </conditional>
            <param name="individual_markers" value="false"/>
            <param name="skip_adj_correction" value="false"/>
            <param name="skip_pseudogene_correction" value="false"/>
            <param name="aai_strain" value="0.9"/>
            <param name="ignore_thresholds" value="false"/>
            <param name="e_value" value="1e-10"/>
            <param name="length" value="0.7"/>
            <param name="extra_outputs" value=""/>
            <output name="output_f3" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="Strain heterogeneity"/>
                    <has_text text="UID5139"/>
                    <has_text text="p__Proteobacteria"/>
                </assert_contents>
            </output>
            <output name="bin_stats_ext" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="marker lineage"/>
                    <has_text text="GCN0"/>
                    <has_text text="Longest contig"/>
                </assert_contents>
            </output>
        </test>
        <!-- Format 4 -->
        <test expect_num_outputs="2">
            <param name="marker_file" ftype="tabular" value="lineage_marker_set"/>
            <param name="hmmer_analyze">
                <collection type="list">
                    <element name="637000110" ftype="txt" value="hmmer.analyze.txt"/>
                </collection>
            </param>
            <param name="bin_stats_analyze" ftype="tabular" value="bin_stats.analyze.tsv"/>
            <param name="checkm_hmm_info" ftype="zip" value="checkm_hmm_info.pkl.gz"/>
            <conditional name="output">
                <param name="out_format" value="4"/>
            </conditional>
            <param name="individual_markers" value="false"/>
            <param name="skip_adj_correction" value="false"/>
            <param name="skip_pseudogene_correction" value="false"/>
            <param name="aai_strain" value="0.9"/>
            <param name="ignore_thresholds" value="false"/>
            <param name="e_value" value="1e-10"/>
            <param name="length" value="0.7"/>
            <param name="extra_outputs" value=""/>
            <output name="output_f4" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="Node Id: UID5103; Marker lineage: f__Enterobacteriaceae"/>
                    <has_text text="PF02542.1"/>
                </assert_contents>
            </output>
            <output name="bin_stats_ext" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="marker lineage"/>
                    <has_text text="GCN0"/>
                    <has_text text="Longest contig"/>
                </assert_contents>
            </output>
        </test>
        <!-- Format 5 -->
        <test expect_num_outputs="2">
            <param name="marker_file" ftype="tabular" value="lineage_marker_set"/>
            <param name="hmmer_analyze">
                <collection type="list">
                    <element name="637000110" ftype="txt" value="hmmer.analyze.txt"/>
                </collection>
            </param>
            <param name="bin_stats_analyze" ftype="tabular" value="bin_stats.analyze.tsv"/>
            <param name="checkm_hmm_info" ftype="zip" value="checkm_hmm_info.pkl.gz"/>
            <conditional name="output">
                <param name="out_format" value="5"/>
            </conditional>
            <param name="individual_markers" value="false"/>
            <param name="skip_adj_correction" value="false"/>
            <param name="skip_pseudogene_correction" value="false"/>
            <param name="aai_strain" value="0.9"/>
            <param name="ignore_thresholds" value="false"/>
            <param name="e_value" value="1e-10"/>
            <param name="length" value="0.7"/>
            <param name="extra_outputs" value=""/>
            <output name="output_f5" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="TIGR02432"/>
                    <has_text text="AC_000091_165"/>
                </assert_contents>
            </output>
            <output name="bin_stats_ext" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="marker lineage"/>
                    <has_text text="GCN0"/>
                    <has_text text="Longest contig"/>
                </assert_contents>
            </output>
        </test>
        <!-- Format 6 -->
        <test expect_num_outputs="2">
            <param name="marker_file" ftype="tabular" value="lineage_marker_set"/>
            <param name="hmmer_analyze">
                <collection type="list">
                    <element name="637000110" ftype="txt" value="hmmer.analyze.txt"/>
                </collection>
            </param>
            <param name="bin_stats_analyze" ftype="tabular" value="bin_stats.analyze.tsv"/>
            <param name="checkm_hmm_info" ftype="zip" value="checkm_hmm_info.pkl.gz"/>
            <conditional name="output">
                <param name="out_format" value="6"/>
            </conditional>
            <param name="individual_markers" value="false"/>
            <param name="skip_adj_correction" value="false"/>
            <param name="skip_pseudogene_correction" value="false"/>
            <param name="aai_strain" value="0.9"/>
            <param name="ignore_thresholds" value="false"/>
            <param name="e_value" value="1e-10"/>
            <param name="length" value="0.7"/>
            <param name="extra_outputs" value=""/>
            <output name="output_f6" ftype="tabular">
                <assert_contents>
                    <has_text text="Marker Id"/>
                    <has_text text="No marker genes satisfied"/>
                </assert_contents>
            </output>
            <output name="bin_stats_ext" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="marker lineage"/>
                    <has_text text="GCN0"/>
                    <has_text text="Longest contig"/>
                </assert_contents>
            </output>
        </test>
        <!-- Format 7 -->
        <test expect_num_outputs="2">
            <param name="marker_file" ftype="tabular" value="lineage_marker_set"/>
            <param name="hmmer_analyze">
                <collection type="list">
                    <element name="637000110" ftype="txt" value="hmmer.analyze.txt"/>
                </collection>
            </param>
            <param name="bin_stats_analyze" ftype="tabular" value="bin_stats.analyze.tsv"/>
            <param name="checkm_hmm_info" ftype="zip" value="checkm_hmm_info.pkl.gz"/>
            <conditional name="output">
                <param name="out_format" value="7"/>
            </conditional>
            <param name="individual_markers" value="false"/>
            <param name="skip_adj_correction" value="false"/>
            <param name="skip_pseudogene_correction" value="false"/>
            <param name="aai_strain" value="0.9"/>
            <param name="ignore_thresholds" value="false"/>
            <param name="e_value" value="1e-10"/>
            <param name="length" value="0.7"/>
            <param name="extra_outputs" value=""/>
            <output name="output_f7" ftype="tabular">
                <assert_contents>
                    <has_text text="Marker Id"/>
                    <has_text text="No marker genes satisfied"/>
                </assert_contents>
            </output>
            <output name="bin_stats_ext" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="marker lineage"/>
                    <has_text text="GCN0"/>
                    <has_text text="Longest contig"/>
                </assert_contents>
            </output>
        </test>
        <!-- Format 8 -->
        <test expect_num_outputs="2">
            <param name="marker_file" ftype="tabular" value="lineage_marker_set"/>
            <param name="hmmer_analyze">
                <collection type="list">
                    <element name="637000110" ftype="txt" value="hmmer.analyze.txt"/>
                </collection>
            </param>
            <param name="bin_stats_analyze" ftype="tabular" value="bin_stats.analyze.tsv"/>
            <param name="checkm_hmm_info" ftype="zip" value="checkm_hmm_info.pkl.gz"/>
            <conditional name="output">
                <param name="out_format" value="8"/>
            </conditional>
            <param name="individual_markers" value="false"/>
            <param name="skip_adj_correction" value="false"/>
            <param name="skip_pseudogene_correction" value="false"/>
            <param name="aai_strain" value="0.9"/>
            <param name="ignore_thresholds" value="false"/>
            <param name="e_value" value="1e-10"/>
            <param name="length" value="0.7"/>
            <param name="extra_outputs" value=""/>
            <output name="output_f8" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="AC_000091_183"/>
                    <has_text text="TIGR02075,9,240"/>
                </assert_contents>
            </output>
            <output name="bin_stats_ext" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="marker lineage"/>
                    <has_text text="GCN0"/>
                    <has_text text="Longest contig"/>
                </assert_contents>
            </output>
        </test>
        <!-- Format 9: also supplies the per-bin gene sequences and a marker exclusion file -->
        <test expect_num_outputs="2">
            <param name="marker_file" ftype="tabular" value="lineage_marker_set"/>
            <param name="hmmer_analyze">
                <collection type="list">
                    <element name="637000110" ftype="txt" value="hmmer.analyze.txt"/>
                </collection>
            </param>
            <param name="bin_stats_analyze" ftype="tabular" value="bin_stats.analyze.tsv"/>
            <param name="checkm_hmm_info" ftype="zip" value="checkm_hmm_info.pkl.gz"/>
            <conditional name="output">
                <param name="out_format" value="9"/>
                <param name="genes_faa">
                    <collection type="list">
                        <element name="637000110" ftype="fasta" value="637000110.faa"/>
                    </collection>
                </param>
            </conditional>
            <param name="exclude_markers" ftype="txt" value="markers_to_exclude" />
            <param name="individual_markers" value="false"/>
            <param name="skip_adj_correction" value="false"/>
            <param name="skip_pseudogene_correction" value="false"/>
            <param name="aai_strain" value="0.9"/>
            <param name="ignore_thresholds" value="false"/>
            <param name="e_value" value="1e-10"/>
            <param name="length" value="0.7"/>
            <param name="extra_outputs" value=""/>
            <output name="output_f9" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="Sequence"/>
                    <has_text text="PF06574.7"/>
                    <has_text text="MKLIRGI"/>
                </assert_contents>
            </output>
            <output name="bin_stats_ext" ftype="tabular">
                <assert_contents>
                    <has_text text="637000110"/>
                    <has_text text="marker lineage"/>
                    <has_text text="GCN0"/>
                    <has_text text="Longest contig"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
@HELP_HEADER@

This command identifies marker genes in bins and calculates genome statistics.

Adjacent called genes matching the same marker gene may indicate a true duplication event, a gene calling error, or an assembly error. If adjacent genes hit distinct regions of the same marker gene HMM, CheckM assumes a gene calling error has occurred and concatenates the two genes. When this occurs, CheckM concatenates the gene ids of the two genes with a pair of ampersands (&&).

Outputs
=======

The output depends on the selected output format:

1. Summary of bin completeness, contamination, and strain heterogeneity

   - Bin Id: bin identifier derived from input FASTA file
   - Marker lineage: indicates lineage used for inferring marker set (a precise indication of where a bin was placed in CheckM's reference tree can be obtained with the tree_qa command)
   - No. genomes: number of reference genomes used to infer marker set
   - No. markers: number of inferred marker genes
   - No. marker sets: number of inferred co-located marker sets
   - 0-5+: number of times each marker gene is identified
   - Completeness: estimated completeness
   - Contamination: estimated contamination
   - Strain heterogeneity: estimated strain heterogeneity

2. Extended summary of bin quality (includes GC, genome size, coding density, ...)

3. Summary of bin quality for increasingly basal lineage-specific marker sets

   - Node Id: unique id of internal node in genome tree from which lineage-specific markers were inferred

4. List of marker genes for each bin along with the number of times each marker was identified

   - Node Id: unique id of internal node in genome tree from which lineage-specific markers were inferred
   - Marker lineage: indicates lineage used for inferring marker set

   Useful for identifying lineage-specific gene loss or duplication

5. List of bin id, marker gene id, and called gene id for each identified marker gene

6. List of marker genes present multiple times in a bin

7. List of marker genes present multiple times on the same scaffold

   Useful for identifying true gene duplication events, gene calling errors, or assembly errors. See the note on adjacent called genes above.

8. List indicating the position of each marker gene within a bin

9. Marker genes identified in each bin and their sequence

    ]]></help>
    <expand macro="citations"/>
</tool>