Mercurial > repos > ufz > checkv_end_to_end

diff end_to_end.xml @ 0:ace74c46b80f draft
planemo upload for repository https://github.com/Helmholtz-UFZ/ufz-galaxy-tools/blob/main/tools/checkv/ commit 625d1e8699c69e5ee3caef0cc5c883a9d9e6ac91
author: ufz
date: Mon, 16 Sep 2024 09:54:01 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/end_to_end.xml	Mon Sep 16 09:54:01 2024 +0000
@@ -0,0 +1,186 @@
+<tool id="checkv_end_to_end" name="CheckV end to end" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01" license="MIT">
+    <description></description>
+    <macros>
+        <!-- Version of the wrapped CheckV release; also used below to pin the package requirement -->
+        <token name="@TOOL_VERSION@">1.0.3</token>
+        <!-- Wrapper revision, appended to the tool version after "+galaxy" -->
+        <token name="@VERSION_SUFFIX@">0</token>
+    </macros>
+    <!-- Cross-reference to the tool's bio.tools registry entry -->
+    <xrefs>
+        <xref type="bio.tools">checkv</xref>
+    </xrefs>
+    <!-- Software dependency, pinned to the tool version token defined above -->
+    <requirements>
+        <requirement type="package" version="@TOOL_VERSION@">checkv</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        ## Run the complete CheckV pipeline; results land in the "output" directory of the job working dir
+        checkv end_to_end '$input' output
+            -d '$reference.fields.path'
+            -t "\${GALAXY_SLOTS:-1}"
+            --remove_tmp
+    ]]></command>
+    <inputs>
+        <!-- Sequences to assess; plain, gzip- and bzip2-compressed FASTA datatypes are accepted -->
+        <param name="input" type="data" format="fasta,fasta.gz,fasta.bz2" label="Input nucleotide sequences in FASTA format"/>
+        <!-- Reference database, chosen from the entries of the "checkv" data table -->
+        <param name="reference" type="select" label="CheckV reference data">
+            <options from_data_table="checkv">
+                <validator type="no_options" message="No reference data available. Contact your Galaxy admin"/>
+            </options>
+        </param>
+        <!-- Extra result tables; each selected value enables the output of the same name -->
+        <param name="optional_outputs" type="select" optional="true" multiple="true" label="Optional outputs">
+            <option value="completeness">Overview of how completeness was estimated</option>
+            <option value="contamination">Overview of how contamination was estimated</option>
+        </param>
+    </inputs>
+    <outputs>
+        <!-- Always collected -->
+        <data name="quality_summary" format="tabular" label="${tool.name} on ${on_string}: Quality summary" from_work_dir="output/quality_summary.tsv"/>
+        <!-- Collected only when the matching value is selected in "optional_outputs" -->
+        <data name="completeness" format="tabular" label="${tool.name} on ${on_string}: Completeness" from_work_dir="output/completeness.tsv">
+            <filter>optional_outputs and "completeness" in optional_outputs</filter>
+        </data>
+        <data name="contamination" format="tabular" label="${tool.name} on ${on_string}: Contamination" from_work_dir="output/contamination.tsv">
+            <filter>optional_outputs and "contamination" in optional_outputs</filter>
+        </data>
+        <!-- Always collected -->
+        <data name="complete_genomes" format="tabular" label="${tool.name} on ${on_string}: Complete Genomes" from_work_dir="output/complete_genomes.tsv"/>
+        <data name="proviruses" format="fasta" label="${tool.name} on ${on_string}: Proviruses" from_work_dir="output/proviruses.fna"/>
+        <data name="viruses" format="fasta" label="${tool.name} on ${on_string}: Viruses" from_work_dir="output/viruses.fna"/>
+    </outputs>
+    <tests>
+        <!-- NOTE(review): both test cases below are commented out, so the tool currently has no active test. They rely on the file test_sequences.fna and on a "checkv" data table entry with value 1.5; presumably they were disabled because that reference data is not available in the test environment (TODO confirm). -->
+        <!-- <test expect_num_outputs="4">
+            <param name="input" value="test_sequences.fna"/>
+            <param name="reference" value="1.5"/>
+            <output name="quality_summary">
+                <assert_contents>
+                    <has_n_columns n="14"/>
+                    <has_n_lines n="41"/>
+                </assert_contents>
+            </output>
+            <output name="complete_genomes">
+                <assert_contents>
+                    <has_n_columns n="11"/>
+                    <has_n_lines n="6"/>
+                </assert_contents>
+            </output>
+            <output name="viruses">
+                <assert_contents>
+                    <has_line_matching expression="^>.*" n="39"/>
+                </assert_contents>
+            </output>
+            <output name="proviruses">
+                <assert_contents>
+                    <has_line_matching expression="^>.*" n="1"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="6">
+            <param name="input" value="test_sequences.fna"/>
+            <param name="reference" value="1.5"/>
+            <param name="optional_outputs" value="completeness,contamination"/>
+            <output name="quality_summary">
+                <assert_contents>
+                    <has_n_columns n="14"/>
+                    <has_n_lines n="41"/>
+                </assert_contents>
+            </output>
+            <output name="completeness">
+                <assert_contents>
+                    <has_n_columns n="15"/>
+                    <has_n_lines n="41"/>
+                </assert_contents>
+            </output>
+            <output name="contamination">
+                <assert_contents>
+                    <has_n_columns n="14"/>
+                    <has_n_lines n="41"/>
+                </assert_contents>
+            </output>
+            <output name="complete_genomes">
+                <assert_contents>
+                    <has_n_columns n="11"/>
+                    <has_n_lines n="6"/>
+                </assert_contents>
+            </output>
+            <output name="viruses">
+                <assert_contents>
+                    <has_line_matching expression="^>.*" n="39"/>
+                </assert_contents>
+            </output>
+            <output name="proviruses">
+                <assert_contents>
+                    <has_line_matching expression="^>.*" n="1"/>
+                </assert_contents>
+            </output>
+        </test> -->
+    </tests>
+    <help><![CDATA[
+
+.. class:: infomark
+
+**What it does**
+
+CheckV is a fully automated command-line pipeline for assessing the quality of single-contig viral genomes, including identification of host contamination for integrated proviruses, estimating completeness for genome fragments, and identification of closed genomes.
+
+There are 4 steps:
+
+1. Remove host contamination on proviruses
+
+   - Genes are first annotated as viral or microbial based on comparison to a custom database of HMMs
+   - CheckV scans over the contig (5' to 3') comparing gene annotations and GC content between a pair of adjacent gene windows
+   - This information is used to compute a score at each intergenic position and identify host-virus breakpoints
+   - Works best for contigs that are mostly viral
+
+2. Estimate genome completeness
+
+   - Proteins are first compared to the CheckV genome database using AAI (average amino acid identity)
+   - After identifying the top hits, completeness is computed as a ratio between the contig length (or viral region length for proviruses) and the length of matched reference
+   - A confidence level is reported based on the strength of the alignment
+   - Generally, high- and medium-confidence estimates are quite accurate
+   - Less frequently, your viral genome may not have a close match to the CheckV database; in these cases CheckV estimates the completeness based on the viral HMMs identified on the contig
+   - Based on the HMMs found, CheckV returns the estimated range for genome completeness (e.g. 35% to 60% completeness), which represents the 90% confidence interval based on the distribution of lengths of reference genomes with the same viral HMMs
+
+3. Predict closed genomes
+
+    - Direct terminal repeats (DTRs)
+        - Repeated sequence of >20-bp at start/end of contig
+        - Most trusted signature in our experience
+        - May indicate circular genome or linear genome replicated from a circular template (i.e. concatamer)
+    - Proviruses
+        - Viral region with predicted host boundaries at 5' and 3' ends (see panel A)
+        - Note: CheckV will not detect proviruses if host regions have already been removed (e.g. using VIBRANT or VirSorter)
+    - Inverted terminal repeats (ITRs)
+        - Repeated sequence of >20-bp at start/end of contig (3' repeat is inverted)
+        - Least trusted signature
+    - For all the methods above, CheckV also checks whether the contig is approximately the correct sequence length based on estimated completeness; this is important because terminal repeats can represent artifacts of metagenomic assembly
+
+4. Summarize quality
+
+   - Based on the results of steps 1-3, CheckV generates a report file and assigns query contigs to one of five quality tiers (consistent with, and expanding upon, the MIUViG quality tiers):
+
+     - Complete
+     - High-quality (>90% completeness)
+     - Medium-quality (50-90% completeness)
+     - Low-quality (<50% completeness)
+     - Undetermined quality
+
+
+Usage
+.....
+
+
+**Input**
+
+- Viral contigs in FASTA format (optionally gzip- or bzip2-compressed).
+- CheckV reference data
+
+**Output**
+
+- Quality summary: Tabular file integrating the results from the three main modules; this should be the main output referred to.
+- Complete genomes: Tabular overview of putative complete genomes identified.
+- Viruses: Virus sequences
+- Proviruses: Provirus sequences
+
+Optional outputs:
+
+- Completeness: Detailed overview of how completeness was estimated
+- Contamination: Detailed overview of how contamination was estimated
+    ]]></help>
+    <!-- Publication to cite when using this tool, referenced by DOI -->
+    <citations>
+        <citation type="doi">10.1038/s41587-020-00774-7</citation>
+    </citations>
+</tool>
\ No newline at end of file