comparison recentrifuge.xml @ 0:09b7b0b2e2c2 draft

planemo upload for repository https://github.com/mesocentre-clermont-auvergne/galaxy-tools/tree/master/tools/recentrifuge commit fdcec50b71967011e4351eb347a9df2840be6bee
author iuc
date Mon, 27 Jun 2022 11:03:22 +0000
parents
children fe733f05c2f8
comparison
equal deleted inserted replaced
-1:000000000000 0:09b7b0b2e2c2
1 <?xml version="1.0" encoding="UTF-8"?>
2
3 <tool id="recentrifuge" name="Recentrifuge" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
4 <description>
5 Robust comparative analysis and contamination removal for metagenomics
6 </description>
7 <macros>
8 <import>macro.xml</import>
9 </macros>
10 <expand macro='xrefs'/>
11 <expand macro="requirements" />
12 <expand macro="version_command" />
13 <command detect_errors="aggressive"><![CDATA[
14 #*======================================
15 Taxonomy database: path column of the entry selected from the ncbi_taxonomy data table (quoted so that paths containing spaces stay one argument)
16 ======================================*#
17 rcf
18 -n '$database.database_name.fields.path'
19 #*======================================
20 Recentrifuge input file: the selected file type picks the rcf input flag and the scoring used when the user leaves the scoring select empty
21 ======================================*#
22 #if $file_type.filetype == "centrifuge"
23 -f '$input_file'
24 #set $default_scoring = 'SHEL'
25 #else if $file_type.filetype == "lmat"
26 -l '$input_file'
27 #set $default_scoring = 'LMAT'
28 #else if $file_type.filetype == "clark"
29 -r '$input_file'
30 #set $default_scoring = 'SHEL'
31 #else if $file_type.filetype == "kraken"
32 -k '$input_file'
33 #set $default_scoring = 'KRAKEN'
34 #else if $file_type.filetype == "generic"
35 -g '$input_file'
36 --format '$file_type.format'
37 #set $default_scoring = 'GENERIC'
38 #end if
39 #*======================================
40 Output options: files are written with the fixed prefix "output" so the outputs section can collect them with from_work_dir
41 ======================================*#
42 -e '$output_option.extra'
43 -o output
44 $output_option.nohtml
45 #*======================================
46 Advanced options: each optional value is only passed when it is set; scoring falls back to the per-classifier default chosen above
47 ======================================*#
48 #if $advanced_option.controls
49 --controls '$advanced_option.controls'
50 #end if
51 #if $advanced_option.scoring
52 --scoring '$advanced_option.scoring'
53 #else
54 --scoring '$default_scoring'
55 #end if
56 #if $advanced_option.minscore_value
57 --minscore '$advanced_option.minscore_value'
58 #end if
59 #if $advanced_option.mintaxa
60 --mintaxa '$advanced_option.mintaxa'
61 #end if
62 #if $advanced_option.exclude_taxa_name
63 --exclude '$advanced_option.exclude_taxa_name'
64 #end if
65 #if $advanced_option.include_taxa_name
66 --include '$advanced_option.include_taxa_name'
67 #end if
68 $advanced_option.avoidcross
69 #*======================================
70 More advanced options: control-sample thresholds, summary mode and boolean switches (each boolean expands to its flag or to nothing)
71 ======================================*#
72 #if $more_advanced_option.ctrlminscore
73 --ctrlminscore '$more_advanced_option.ctrlminscore'
74 #end if
75 #if $more_advanced_option.ctrlmintaxa
76 --ctrlmintaxa '$more_advanced_option.ctrlmintaxa'
77 #end if
78 --summary '$more_advanced_option.summary'
79 $more_advanced_option.takeoutroot
80 $more_advanced_option.nokollapse
81 $more_advanced_option.strain
82 $more_advanced_option.sequential
83 #*======================================
84 Log file output: stdout and stderr both go to the logfile dataset, using POSIX redirection so the command also works when the job shell is not bash
85 ======================================*#
86 > '$logfile' 2>&1
87 ]]>
88 </command>
89 <inputs>
90 <!-- Input file: tabular classifier output, plus the classifier type that selects the matching rcf input flag (-f, -r, -g, -k or -l); only the generic type asks for an extra format string -->
91 <param name="input_file" type="data" format="tabular" label="Select taxonomy file tabular formated"/>
92 <conditional name="file_type">
93 <param name="filetype" type="select" display="radio" label="Type of input file (Centrifuge, CLARK, Generic, Kraken, LMAT)" help="(-f, -r, -g, -k, -l)">
94 <option value="centrifuge">Centrifuge</option>
95 <option value="clark">CLARK</option>
96 <option value="generic">Generic</option>
97 <option value="lmat" >LMAT</option>
98 <option value="kraken" >Kraken</option>
99 </param>
100 <when value="centrifuge"/>
101 <when value="lmat"/>
102 <when value="clark"/>
103 <when value="kraken"/>
104 <when value="generic">
105 <param argument="--format" type="text" label="Format of the output files from a generic classifier"
106 help="string like 'TYP:csv,TID:1,LEN:3,SCO:6,UNC:0'
107 where valid file Types are csv/tsv/ssv, and the rest of fields indicate the number of column used (starting in 1)
108 for the TaxIDs assigned,the LENgth of the read, the SCOre given to the assignment">
109 </param>
110 </when>
111 </conditional>
112 <!-- Taxonomy database: chosen from the ncbi_taxonomy data table; the no_options validator reports an error when the table offers no entry -->
113 <section name="database" title="Database type" expanded="true">
114 <param name="database_name" type="select" label="Cached database whith taxa ID">
115 <options from_data_table="ncbi_taxonomy">
116 <validator message="No NCBI database is available" type="no_options"/>
117 </options>
118 </param>
119 </section>
120 <!-- Output options: kind of extra output files to generate (drives the output filters) and whether the HTML report is suppressed -->
121 <section name="output_option" title="Output options">
122 <param argument="--extra" type="select" label="Type of extra output to be generated (default on CSV)" help="Default : CSV">
123 <option value="CSV" selected="true" >CSV</option>
124 <option value="DYNOMICS">DYNOMICS</option>
125 <option value="FULL">FULL</option>
126 <option value="TSV" >TSV</option>
127 </param>
128 <param argument="--nohtml" type="boolean" truevalue="--nohtml" falsevalue="" label="Suppress saving the HTML output file" help="remove html output"/>
129 </section>
130 <!-- Advanced options: number of control samples, scoring scheme (empty means the per-classifier default set in the command), score and taxa thresholds, taxid include/exclude lists (sanitized to letters, digits and commas) and the cross-analysis switch -->
131 <section name="advanced_option" title="Coarse tuning of algorithm parameters">
132 <param name="controls" type="integer" optional="true" value="0" label="Number of first samples will be treated as negative controls" help="Add control samples (default is 0)"/>
133 <param name="scoring" type="select" optional="true" label="Type of scoring to be applied" help="Scoring algorithm, depending of the input files, see the help section of the tools">
134 <option value="SHEL">SHEL</option>
135 <option value="LENGTH">LENGTH</option>
136 <option value="LOGLENGTH">LOGLENGTH</option>
137 <option value="NORMA">NORMA</option>
138 <option value="LMAT">LMAT (only for LMAT input)</option>
139 <option value="CLARK_C">CLARK_C (only for Clark input)</option>
140 <option value="CLARK_G">CLARK_G (only for Clark input)</option>
141 <option value="KRAKEN">KRAKEN (only for kraken input)</option>
142 <option value="GENERIC">GENERIC (only for generic input)</option>
143 </param>
144 <param name="minscore_value" type="integer" optional="true" min="0" value="0" label="Minimum score/confidence of the classification of a
145 read to pass the quality filter" help="All pass by default (--minscore)"/>
146 <param name="mintaxa" type="integer" optional="true" min="0" value="0" label="Minimum taxa to avoid collapsing one level into the
147 parent" help="If not specified a value will be automatically
148 assigned (--mintaxa)"/>
149 <param name="exclude_taxa_name" argument="--exclude" type="text" optional="true" label="NCBI taxid code to exclude a taxon and all underneath" help="Default: no exclude">
150 <sanitizer invalid_char="">
151 <valid initial="string.letters,string.digits">
152 <add value="," />
153 </valid>
154 </sanitizer>
155 <validator type="regex">[A-Za-z0-9,]+</validator>
156 </param>
157 <param name="include_taxa_name" argument="--include" type="text" optional="true" label="NCBI taxid code to include a taxon and all underneath" help="Default: all included">
158 <sanitizer invalid_char="">
159 <valid initial="string.letters,string.digits">
160 <add value="," />
161 </valid>
162 </sanitizer>
163 <validator type="regex">[A-Za-z0-9,]+</validator>
164 </param>
165 <param argument="--avoidcross" type="boolean" truevalue="--avoidcross" falsevalue="" label="Avoid cross analysis" help="Default: no"/>
166 </section>
167 <!-- Fine tuning: thresholds that apply to control samples only, the summary mode, and boolean switches whose truevalue is the rcf flag inserted into the command when enabled -->
168 <section name="more_advanced_option" title=" Fine tuning of algorithm parameters">
169 <param name="ctrlminscore" argument="--ctrlminscore" type="integer" optional="true" value="0" label="Minimum score in control samples to pass the filter" help="Minimum score/confidence of the classification of a read in control samples to pass the quality filter; it defaults to minscore"/>
170 <param name="ctrlmintaxa" argument="--ctrlmintaxa" type="integer" optional="true" value="0" label="Minimum taxa to avoid collapsing one level into the parent" help="If not specified a value will be automatically assigned"/>
171 <param name="summary" argument="--summary" type="select" label="Add a summary of the analysis (default is Add)" help="Select to 'add' summary samples to other samples, or to 'only' show summary samples or to 'avoid' summaries at all">
172 <option value="ADD" selected="true">Add</option>
173 <option value="ONLY">Only</option>
174 <option value="AVOID">Avoid</option>
175 </param>
176 <param argument="--takeoutroot" type="boolean" truevalue="--takeoutroot" falsevalue="" label="Remove counts directly assigned to the root level"/>
177 <param argument="--nokollapse" type="boolean" truevalue="--nokollapse" falsevalue="" label="Show the cellular organisms taxon" />
178 <param argument="--strain" type="boolean" truevalue="--strain" falsevalue="" label="Strain level instead of species as the resolution limit for the robust contamination removal algorithm; use with caution, this is an experimental feature"/>
179 <param argument="--sequential" type="boolean" truevalue="--sequential" falsevalue="" label="Deactivate parallel processing"/>
180 </section>
181 </inputs>
182 <!-- Outputs: the log file is always produced; the HTML report is kept unless nohtml is set; the extra parameter selects the data/stat CSV pair, the data/stat TSV pair, or the xlsx report (FULL and DYNOMICS) -->
183 <outputs>
184 <data name="html_report" format="html" from_work_dir="output.rcf.html" label="${tool.name} on ${on_string}: html report">
185 <filter> output_option['nohtml'] == False</filter>
186 </data>
187 <data name="logfile" format="txt" label="${tool.name} on ${on_string}: log file"/>
188 <data name="data_csv" format="tabular" from_work_dir="output.rcf.data.csv" label="${tool.name} on ${on_string}: data.csv">
189 <filter> output_option['extra'] == 'CSV' </filter>
190 </data>
191 <data name="stat_csv" format="tabular" from_work_dir="output.rcf.stat.csv" label="${tool.name} on ${on_string}: stat.csv">
192 <filter> output_option['extra'] == 'CSV' </filter>
193 </data>
194 <data name="data_tsv" format="tabular" from_work_dir="output.rcf.data.tsv" label="${tool.name} on ${on_string}: data tsv">
195 <filter> output_option['extra'] == 'TSV' </filter>
196 </data>
197 <data name="stat_tsv" format="tabular" from_work_dir="output.rcf.stat.tsv" label="${tool.name} on ${on_string}: stat tsv">
198 <filter> output_option['extra'] == 'TSV' </filter>
199 </data>
200 <data name="xls_report" format="xlsx" from_work_dir="output.rcf.xlsx" label="${tool.name} on ${on_string}: xlsx report">
201 <filter> output_option['extra'] == 'FULL' or output_option['extra'] == 'DYNOMICS'</filter>
202 </data>
203 </outputs>
204 <tests>
205 <test expect_num_outputs="4"> <!-- TEST_1: kraken input, cached test database, CSV extra output with the HTML report kept (4 outputs: html, log, data.csv, stat.csv) -->
206 <section name="database">
207 <param name="database_name" value="test-db-2022"/>
208 </section>
209 <param name="input_file" value="kraken_test/kraken.out"/>
210 <conditional name="file_type">
211 <param name="filetype" value="kraken"/>
212 </conditional>
213 <section name="output_option">
214 <param name="extra" value="CSV"/>
215 </section>
216 <section name="more_advanced_option">
217 <param name="summary" value="AVOID"/>
218 </section>
219 <output name="data_csv" file="kraken_test/test1_csv.rcf.data.csv" lines_diff="2"/>
220 <output name="stat_csv" file="kraken_test/test1_csv.rcf.stat.csv" lines_diff="2"/>
221 <output name="html_report" file="kraken_test/test1_csv.rcf.html" lines_diff="2"/>
222 <output name="logfile" file="kraken_test/test1_csv.log" lines_diff="12"/>
223 </test>
224 <test expect_num_outputs="3"> <!-- TEST_2: centrifuge input, cached test database, CSV extra output without HTML report, NORMA scoring plus several advanced options (3 outputs: log, data.csv, stat.csv) -->
225 <section name="database">
226 <param name="database_name" value="test-db-2022"/>
227 </section>
228 <param name="input_file" value="centrifuge_test/centrifuge.out"/>
229 <conditional name="file_type">
230 <param name="filetype" value="centrifuge"/>
231 </conditional>
232 <section name="output_option">
233 <param name="extra" value="CSV"/>
234 <param name="nohtml" value="true"/>
235 </section>
236 <section name="advanced_option">
237 <param name="controls" value="0"/>
238 <param name="scoring" value="NORMA"/>
239 <param name="minscore_value" value="1"/>
240 <param name="avoidcross" value="true"/>
241 </section>
242 <section name="more_advanced_option">
243 <param name="ctrlminscore" value="1"/>
244 <param name="summary" value="AVOID"/>
245 </section>
246 <output name="stat_csv" file="centrifuge_test/test2_csv.rcf.stat.csv" lines_diff="2"/>
247 <output name="data_csv" file="centrifuge_test/test2_csv.rcf.data.csv" lines_diff="2"/>
248 <output name="logfile" file="centrifuge_test/test2_csv.log" lines_diff="12"/>
249 </test>
250 <test expect_num_outputs="3"> <!-- TEST_3: kraken input, cached test database, TSV extra output without HTML report, LOGLENGTH scoring, summary only, strain level (3 outputs: log, data.tsv, stat.tsv) -->
251 <section name="database">
252 <param name="database_name" value="test-db-2022"/>
253 </section>
254 <param name="input_file" value="kraken_test/kraken.out"/>
255 <conditional name="file_type">
256 <param name="filetype" value="kraken"/>
257 </conditional>
258 <section name="output_option" >
259 <param name="extra" value="TSV"/>
260 <param name="nohtml" value="true"/>
261 </section>
262 <section name="advanced_option">
263 <param name="scoring" value="LOGLENGTH"/>
264 </section>
265 <section name="more_advanced_option">
266 <param name="summary" value="ONLY"/>
267 <param name="strain" value="true"/>
268 </section>
269 <output name="data_tsv" file="kraken_test/test3_rcf.data.tsv" lines_diff="2"/>
270 <output name="stat_tsv" file="kraken_test/test3_rcf.stat.tsv" lines_diff="2"/>
271 <output name="logfile" file="kraken_test/test3_tsv.log" lines_diff="12"/>
272 </test>
273 </tests>
274 <help><![CDATA[
275 **What it does**
276
277 With Recentrifuge, researchers can interactively explore what organisms are in their samples and at which level of confidence, enabling robust comparative analysis of multiple samples in any metagenomic study.
278
279 * Removes diverse contaminants, including crossovers, using a novel robust contamination removal algorithm.
280 * Provides a confidence level for every result, since the calculated score propagates to all the downstream analysis and comparisons.
281 * Unveils the generalities and specificities in the metagenomic samples, thanks to a new comparative analysis engine.
282
283
284 Recentrifuge is especially useful when a more reliable detection of minority organisms is needed (e.g. in the case of low microbial biomass metagenomic studies)
285 in clinical, environmental, or forensic analysis. Beyond the standard confidence levels, Recentrifuge implements others devoted to variable length reads,
286 very convenient for complex datasets generated by nanopore sequencers.
287
288 **Input option**
289 Recentrifuge can deal with several different taxonomic output files.
290 Input files can come from the Centrifuge, Kraken, CLARK or LMAT software.
291 A generic function to accept other files is available, but it needs additional information about the file content.
292 If generic is chosen, the format option needs a string like: 'TYP:csv,TID:1,LEN:3,SCO:6,UNC:0',
293 where TYP is csv/tsv/ssv, and the rest of the fields indicate the number of the column used (starting at 1)
294 for the TaxIDs assigned, the LENgth of the read and the SCOre given to the assignment, plus the taxid code used for UNClassified reads.
295
296
297 **Database for recentrifuge**
298 Recentrifuge first needs the taxonomic database from NCBI (nodes.dmp and names.dmp).
299 The database is selected in the "Database type" section from the taxonomic databases
300 already installed (cached) on this Galaxy server; loading the files from your history
301 is not offered by this version of the tool.
302
303
304 **Output options**
305 1. Depending on the option provided, the output file format can be csv, tsv or xlsx, combined in one or more files (extra).
306 2. By default an html file is generated to visualize the data; it can be removed using the nohtml option
307
308
309 **Advanced options**
310 1. Recentrifuge can integrate samples in the data which are negative controls to normalize the data
311 2. Scoring is an option to choose the scoring method for the reads classified by the taxonomic tools:
312 SHEL (Single Hit Equivalent Length): This is a score value in base pairs roughly equivalent to a single hit to the database.
313 KRAKEN: This scoring scheme is only available for this classifier. It divides the k-mer hit count of the top assignment by the total k-mers in the read and multiplies the result by 100 to give a percentage of coverage (the fraction of the read k-mers covered by k-mers belonging to the read final assignment). This is the default scoring scheme for Kraken samples, and it supports the mixing of samples with different read length.
314 LENGTH: The score of a read will be its length (or the combined length of mate pairs).
315 LOGLENGTH: Logarithm (base 10) of the length score.
316 NORMA: This score is the normalized score SHEL / LENGTH in percentage, so it takes into account both the assignment quality and the length of the read. Very useful when both the score assignments and lengths are variable among the reads.
317 LMAT: This scoring scheme is only available for this classifier.
318 CLARK_C: This scoring scheme is not available for other classifiers. It takes the confidence score as the score for a read, conf=h1/(h1+h2), or 1-conf=h2/(h1+h2) in case the majority of a read is not classified (1st assignment unclassified). See CLARK's README file for details on how h1 and h2 are calculated. If you use this scoring, you will probably want to filter to a minimum of 0.5 (-y 0.5) or beyond, as under 0.5 the assignments have very low confidence.
319 CLARK_G: This scheme scores every read with its CLARK gamma score, so it is only available for this classifier.
320 3. You can choose a filter for read quality using the minscore option (--minscore)
321 4. You can include or exclude specific taxa using the NCBI taxid code
322
323
324 **More advanced options**
325 1. You can choose a filter for read quality specifically on the control samples
326 2. You can specify the minimum taxa value to avoid collapsing one level into its parent
327 3. A summary option is available to produce a summary file
328 Some other options are available and are described in the more advanced panel of the tool
329
330
331 rcf - Release 1.8.1 - Mar 2022
332
333 Copyright (C) 2017–2022, Jose Manuel Martí Martínez
334
335 This program is free software: you can redistribute it and/or modify
336 it under the terms of the GNU Affero General Public License as
337 published by the Free Software Foundation, either version 3 of the
338 License, or (at your option) any later version.
339
340 This program is distributed in the hope that it will be useful,
341 but WITHOUT ANY WARRANTY; without even the implied warranty of
342 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
343 GNU Affero General Public License for more details.
344
345 You should have received a copy of the GNU Affero General Public License
346 along with this program. If not, see <https://www.gnu.org/licenses/>.
347 ]]></help>
348 <expand macro="citations"/>
349 </tool>