Mercurial > repos > ufz > checkv_end_to_end
comparison end_to_end.xml @ 0:ace74c46b80f draft
planemo upload for repository https://github.com/Helmholtz-UFZ/ufz-galaxy-tools/blob/main/tools/checkv/ commit 625d1e8699c69e5ee3caef0cc5c883a9d9e6ac91
author | ufz |
---|---|
date | Mon, 16 Sep 2024 09:54:01 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:ace74c46b80f |
---|---|
1 <tool id="checkv_end_to_end" name="CheckV end to end" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01" license="MIT"> | |
2 <description>Assess the quality of single-contig viral genomes</description> | |
3 <macros> | |
4 <token name="@TOOL_VERSION@">1.0.3</token> | |
5 <token name="@VERSION_SUFFIX@">0</token> | |
6 </macros> | |
7 <xrefs> | |
8 <xref type="bio.tools">checkv</xref> | |
9 </xrefs> | |
10 <requirements> | |
11 <requirement type="package" version="@TOOL_VERSION@">checkv</requirement> | |
12 </requirements> | |
13 <command detect_errors="exit_code"><![CDATA[ | |
14 checkv end_to_end | |
15 '$input' | |
16 output | |
17 -d '$reference.fields.path' | |
18 --remove_tmp | |
19 -t "\${GALAXY_SLOTS:-1}" | |
20 ]]></command> | |
21 <inputs> | |
22 <param name="input" type="data" format="fasta,fasta.gz,fasta.bz2" label="Input nucleotide sequences in FASTA format"/> | |
23 <param name="reference" type="select" label="CheckV reference data"> | |
24 <options from_data_table="checkv"> | |
25 <validator type="no_options" message="No reference data available. Contact your Galaxy admin"/> | |
26 </options> | |
27 </param> | |
28 <param name="optional_outputs" type="select" optional="true" multiple="true" label="Optional outputs" help="The quality summary, complete genomes, viruses and proviruses outputs are always produced."> | |
29 <option value="completeness">Overview of how completeness was estimated</option> | |
30 <option value="contamination">Overview of how contamination was estimated</option> | |
31 </param> | |
32 </inputs> | |
33 <outputs> | |
34 <data name="quality_summary" format="tabular" from_work_dir="output/quality_summary.tsv" label="${tool.name} on ${on_string}: Quality summary"/> | |
35 <data name="completeness" format="tabular" from_work_dir="output/completeness.tsv" label="${tool.name} on ${on_string}: Completeness"> | |
36 <filter>optional_outputs and "completeness" in optional_outputs</filter> | |
37 </data> | |
38 <data name="contamination" format="tabular" from_work_dir="output/contamination.tsv" label="${tool.name} on ${on_string}: Contamination"> | |
39 <filter>optional_outputs and "contamination" in optional_outputs</filter> | |
40 </data> | |
41 <data name="complete_genomes" format="tabular" from_work_dir="output/complete_genomes.tsv" label="${tool.name} on ${on_string}: Complete Genomes"/> | |
42 <data name="proviruses" format="fasta" from_work_dir="output/proviruses.fna" label="${tool.name} on ${on_string}: Proviruses"/> | |
43 <data name="viruses" format="fasta" from_work_dir="output/viruses.fna" label="${tool.name} on ${on_string}: Viruses"/> | |
44 </outputs> | |
45 <tests> | |
46 <!-- <test expect_num_outputs="4"> | |
47 <param name="input" value="test_sequences.fna"/> | |
48 <param name="reference" value="1.5"/> | |
49 <output name="quality_summary"> | |
50 <assert_contents> | |
51 <has_n_columns n="14"/> | |
52 <has_n_lines n="41"/> | |
53 </assert_contents> | |
54 </output> | |
55 <output name="complete_genomes"> | |
56 <assert_contents> | |
57 <has_n_columns n="11"/> | |
58 <has_n_lines n="6"/> | |
59 </assert_contents> | |
60 </output> | |
61 <output name="viruses"> | |
62 <assert_contents> | |
63 <has_line_matching expression="^>.*" n="39"/> | |
64 </assert_contents> | |
65 </output> | |
66 <output name="proviruses"> | |
67 <assert_contents> | |
68 <has_line_matching expression="^>.*" n="1"/> | |
69 </assert_contents> | |
70 </output> | |
71 </test> | |
72 <test expect_num_outputs="6"> | |
73 <param name="input" value="test_sequences.fna"/> | |
74 <param name="reference" value="1.5"/> | |
75 <param name="optional_outputs" value="completeness,contamination"/> | |
76 <output name="quality_summary"> | |
77 <assert_contents> | |
78 <has_n_columns n="14"/> | |
79 <has_n_lines n="41"/> | |
80 </assert_contents> | |
81 </output> | |
82 <output name="completeness"> | |
83 <assert_contents> | |
84 <has_n_columns n="15"/> | |
85 <has_n_lines n="41"/> | |
86 </assert_contents> | |
87 </output> | |
88 <output name="contamination"> | |
89 <assert_contents> | |
90 <has_n_columns n="14"/> | |
91 <has_n_lines n="41"/> | |
92 </assert_contents> | |
93 </output> | |
94 <output name="complete_genomes"> | |
95 <assert_contents> | |
96 <has_n_columns n="11"/> | |
97 <has_n_lines n="6"/> | |
98 </assert_contents> | |
99 </output> | |
100 <output name="viruses"> | |
101 <assert_contents> | |
102 <has_line_matching expression="^>.*" n="39"/> | |
103 </assert_contents> | |
104 </output> | |
105 <output name="proviruses"> | |
106 <assert_contents> | |
107 <has_line_matching expression="^>.*" n="1"/> | |
108 </assert_contents> | |
109 </output> | |
110 </test> --> | |
111 </tests> | |
112 <help><![CDATA[ | |
113 | |
114 .. class:: infomark | |
115 | |
116 **What it does** | |
117 | |
118 CheckV is a fully automated command-line pipeline for assessing the quality of single-contig viral genomes, including identification of host contamination for integrated proviruses, estimating completeness for genome fragments, and identification of closed genomes. | |
119 | |
120 There are 4 steps: | |
121 | |
122 1. Remove host contamination on proviruses | |
123 - Genes are first annotated as viral or microbial based on comparison to a custom database of HMMs | |
124 - CheckV scans over the contig (5' to 3') comparing gene annotations and GC content between a pair of adjacent gene windows | |
125 - This information is used to compute a score at each intergenic position and identify host-virus breakpoints | |
126 - Works best for contigs that are mostly viral | |
127 | |
128 2. Estimate genome completeness | |
129 | |
130 - Proteins are first compared to the CheckV genome database using AAI (average amino acid identity) | |
131 - After identifying the top hits, completeness is computed as a ratio between the contig length (or viral region length for proviruses) and the length of matched reference | |
132 - A confidence level is reported based on the strength of the alignment | |
133 - Generally, high- and medium-confidence estimates are quite accurate | |
134 - Less frequently, your viral genome may not have a close match to the CheckV database; in these cases CheckV estimates the completeness based on the viral HMMs identified on the contig | |
135 - Based on the HMMs found, CheckV returns the estimated range for genome completeness (e.g. 35% to 60% completeness), which represents the 90% confidence interval based on the distribution of lengths of reference genomes with the same viral HMMs | |
136 | |
137 3. Predict closed genomes | |
138 | |
139 - Direct terminal repeats (DTRs) | |
140 - Repeated sequence of >20-bp at start/end of contig | |
141 - Most trusted signature in our experience | |
142 - May indicate circular genome or linear genome replicated from a circular template (i.e. concatemer) | |
143 - Proviruses | |
144 - Viral region with predicted host boundaries at 5' and 3' ends (see panel A) | |
145 - Note: CheckV will not detect proviruses if host regions have already been removed (e.g. using VIBRANT or VirSorter) | |
146 - Inverted terminal repeats (ITRs) | |
147 - Repeated sequence of >20-bp at start/end of contig (3' repeat is inverted) | |
148 - Least trusted signature | |
149 - For all the methods above, CheckV also checks whether the contig is approximately the correct sequence length based on estimated completeness; this is important because terminal repeats can represent artifacts of metagenomic assembly | |
150 | |
151 4. Summarize quality | |
152 | |
153 - Based on the results of steps 1-3, CheckV generates a report file and assigns query contigs to one of five quality tiers (consistent with, and expanding upon, the MIUViG quality tiers): | |
154 | |
155 - Complete | |
156 - High-quality (>90% completeness) | |
157 - Medium-quality (50-90% completeness) | |
158 - Low-quality (<50% completeness) | |
159 - Undetermined quality | |
160 | |
161 | |
162 Usage | |
163 ..... | |
164 | |
165 | |
166 **Input** | |
167 | |
168 - Viral contigs in FASTA format (optionally gzip- or bzip2-compressed). | |
169 - CheckV reference data | |
170 | |
171 **Output** | |
172 | |
173 - Quality Summary: Tabular file integrating the results from the three main modules; this should be the main output referred to. | |
174 - Complete genomes: Tabular overview of putative complete genomes identified. | |
175 - Viruses: Virus sequences | |
176 - Proviruses: Provirus sequences | |
177 | |
178 Optional outputs: | |
179 | |
180 - Completeness: detailed overview of how completeness was estimated | |
181 - Contamination: detailed overview of how contamination was estimated | |
182 ]]></help> | |
183 <citations> | |
184 <citation type="doi">10.1038/s41587-020-00774-7</citation> | |
185 </citations> | |
186 </tool> |