comparison end_to_end.xml @ 0:ace74c46b80f draft

planemo upload for repository https://github.com/Helmholtz-UFZ/ufz-galaxy-tools/blob/main/tools/checkv/ commit 625d1e8699c69e5ee3caef0cc5c883a9d9e6ac91
author ufz
date Mon, 16 Sep 2024 09:54:01 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:ace74c46b80f
1 <tool id="checkv_end_to_end" name="CheckV end to end" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01" license="MIT">
2 <description>Assess the quality of single-contig viral genomes</description>
3 <macros> <!-- tokens substituted into the tool's version attribute and the package requirement below -->
4 <token name="@TOOL_VERSION@">1.0.3</token> <!-- used for the requirement version and as the first part of the tool version -->
5 <token name="@VERSION_SUFFIX@">0</token> <!-- appended after "+galaxy" in the tool version -->
6 </macros>
7 <xrefs> <!-- cross-reference to the bio.tools registry entry -->
8 <xref type="bio.tools">checkv</xref>
9 </xrefs>
10 <requirements> <!-- the checkv package, pinned to @TOOL_VERSION@ -->
11 <requirement type="package" version="@TOOL_VERSION@">checkv</requirement>
12 </requirements>
13 <command detect_errors="exit_code"><![CDATA[
14 checkv end_to_end
15 '$input'
16 output
17 -d '$reference.fields.path'
18 --remove_tmp
19 -t "\${GALAXY_SLOTS:-1}"
20 ]]></command> <!-- Runs "checkv end_to_end" on the input dataset and writes into the folder "output", which the from_work_dir paths of the outputs read from. -d receives the "path" field of the selected reference entry; -t receives GALAXY_SLOTS, falling back to 1 when it is unset. Failure is detected from the exit code. -->
21 <inputs>
22 <param name="input" type="data" format="fasta,fasta.gz,fasta.bz2" label="Input nucleotide sequences in FASTA format"/>
23 <param name="reference" type="select" label="CheckV reference data"> <!-- choices come from the "checkv" data table; the validator reports an error when it offers no options -->
24 <options from_data_table="checkv">
25 <validator type="no_options" message="No reference data available. Contact your Galaxy admin"/>
26 </options>
27 </param>
28 <param name="optional_outputs" type="select" optional="true" multiple="true" label="Optional outputs"> <!-- each selected value enables the output whose filter tests for it -->
29 <option value="completeness">Overview of how completeness was estimated</option>
30 <option value="contamination">Overview of how contamination was estimated</option>
31 </param>
32 </inputs>
33 <outputs> <!-- every dataset is collected from the "output" folder in the job working directory; quality_summary, complete_genomes, proviruses and viruses have no filter, completeness and contamination are filtered on optional_outputs -->
34 <data name="quality_summary" format="tabular" from_work_dir="output/quality_summary.tsv" label="${tool.name} on ${on_string}: Quality summary"/>
35 <data name="completeness" format="tabular" from_work_dir="output/completeness.tsv" label="${tool.name} on ${on_string}: Completeness">
36 <filter>optional_outputs and "completeness" in optional_outputs</filter> <!-- the leading truthiness check covers the case where nothing is selected -->
37 </data>
38 <data name="contamination" format="tabular" from_work_dir="output/contamination.tsv" label="${tool.name} on ${on_string}: Contamination">
39 <filter>optional_outputs and "contamination" in optional_outputs</filter>
40 </data>
41 <data name="complete_genomes" format="tabular" from_work_dir="output/complete_genomes.tsv" label="${tool.name} on ${on_string}: Complete Genomes"/>
42 <data name="proviruses" format="fasta" from_work_dir="output/proviruses.fna" label="${tool.name} on ${on_string}: Proviruses"/>
43 <data name="viruses" format="fasta" from_work_dir="output/viruses.fna" label="${tool.name} on ${on_string}: Viruses"/>
44 </outputs>
45 <tests> <!-- NOTE(review): both tests below are commented out, so the tool currently has no active test. Presumably they depend on a "checkv" data table entry with value 1.5 and on test_sequences.fna being available to the test runner; confirm and re-enable. The first test expects the 4 unfiltered outputs, the second selects both optional outputs and expects 6. -->
46 <!-- <test expect_num_outputs="4">
47 <param name="input" value="test_sequences.fna"/>
48 <param name="reference" value="1.5"/>
49 <output name="quality_summary">
50 <assert_contents>
51 <has_n_columns n="14"/>
52 <has_n_lines n="41"/>
53 </assert_contents>
54 </output>
55 <output name="complete_genomes">
56 <assert_contents>
57 <has_n_columns n="11"/>
58 <has_n_lines n="6"/>
59 </assert_contents>
60 </output>
61 <output name="viruses">
62 <assert_contents>
63 <has_line_matching expression="^>.*" n="39"/>
64 </assert_contents>
65 </output>
66 <output name="proviruses">
67 <assert_contents>
68 <has_line_matching expression="^>.*" n="1"/>
69 </assert_contents>
70 </output>
71 </test>
72 <test expect_num_outputs="6">
73 <param name="input" value="test_sequences.fna"/>
74 <param name="reference" value="1.5"/>
75 <param name="optional_outputs" value="completeness,contamination"/>
76 <output name="quality_summary">
77 <assert_contents>
78 <has_n_columns n="14"/>
79 <has_n_lines n="41"/>
80 </assert_contents>
81 </output>
82 <output name="completeness">
83 <assert_contents>
84 <has_n_columns n="15"/>
85 <has_n_lines n="41"/>
86 </assert_contents>
87 </output>
88 <output name="contamination">
89 <assert_contents>
90 <has_n_columns n="14"/>
91 <has_n_lines n="41"/>
92 </assert_contents>
93 </output>
94 <output name="complete_genomes">
95 <assert_contents>
96 <has_n_columns n="11"/>
97 <has_n_lines n="6"/>
98 </assert_contents>
99 </output>
100 <output name="viruses">
101 <assert_contents>
102 <has_line_matching expression="^>.*" n="39"/>
103 </assert_contents>
104 </output>
105 <output name="proviruses">
106 <assert_contents>
107 <has_line_matching expression="^>.*" n="1"/>
108 </assert_contents>
109 </output>
110 </test> -->
111 </tests>
112 <help><![CDATA[
113
114 .. class:: infomark
115
116 **What it does**
117
118 CheckV is a fully automated command-line pipeline for assessing the quality of single-contig viral genomes, including identification of host contamination for integrated proviruses, estimating completeness for genome fragments, and identification of closed genomes.
119
120 There are 4 steps:
121
122 1. Remove host contamination on proviruses
123 - Genes are first annotated as viral or microbial based on comparison to a custom database of HMMs
124 - CheckV scans over the contig (5' to 3') comparing gene annotations and GC content between a pair of adjacent gene windows
125 - This information is used to compute a score at each intergenic position and identify host-virus breakpoints
126 - Works best for contigs that are mostly viral
127
128 2. Estimate genome completeness
129
130 - Proteins are first compared to the CheckV genome database using AAI (average amino acid identity)
131 - After identifying the top hits, completeness is computed as a ratio between the contig length (or viral region length for proviruses) and the length of matched reference
132 - A confidence level is reported based on the strength of the alignment
133 - Generally, high- and medium-confidence estimates are quite accurate
134 - Less frequently, your viral genome may not have a close match to the CheckV database; in these cases CheckV estimates the completeness based on the viral HMMs identified on the contig
135 - Based on the HMMs found, CheckV returns the estimated range for genome completeness (e.g. 35% to 60% completeness), which represents the 90% confidence interval based on the distribution of lengths of reference genomes with the same viral HMMs
136
137 3. Predict closed genomes
138
139 - Direct terminal repeats (DTRs)
140 - Repeated sequence of >20-bp at start/end of contig
141 - Most trusted signature in our experience
142 - May indicate circular genome or linear genome replicated from a circular template (i.e. concatamer)
143 - Proviruses
144 - Viral region with predicted host boundaries at 5' and 3' ends (see panel A)
145 - Note: CheckV will not detect proviruses if host regions have already been removed (e.g. using VIBRANT or VirSorter)
146 - Inverted terminal repeats (ITRs)
147 - Repeated sequence of >20-bp at start/end of contig (3' repeat is inverted)
148 - Least trusted signature
149 - For all the methods above, CheckV also checks whether the contig is approximately the correct sequence length based on estimated completeness; this is important because terminal repeats can represent artifacts of metagenomic assembly
150
151 4. Summarize quality.
152
153 - Based on the results of steps 1-3, CheckV generates a report file and assigns each query contig to one of five quality tiers (consistent with, and expanding upon, the MIUViG quality tiers):
154
155 - Complete
156 - High-quality (>90% completeness)
157 - Medium-quality (50-90% completeness)
158 - Low-quality (<50% completeness)
159 - Undetermined quality
160
161
162 Usage
163 .....
164
165
166 **Input**
167
168 - Viral contigs in FASTA format (optionally gzip- or bzip2-compressed).
169 - CheckV reference data
170
171 **Output**
172
173 - Quality Summary: Tabular file integrating the results of the three main modules; this should be the main output referred to.
174 - Complete genomes: Tabular overview of putative complete genomes identified.
175 - Viruses: Virus sequences
176 - Proviruses: Provirus sequences
177
178 Optional outputs:
179
180 - Completeness: detailed overview of how completeness was estimated
181 - Contamination: detailed overview of how contamination was estimated
182 ]]></help>
183 <citations> <!-- DOI of the publication cited for this tool -->
184 <citation type="doi">10.1038/s41587-020-00774-7</citation>
185 </citations>
186 </tool>