comparison preprocess.xml @ 0:76c750c5f0d1 draft default tip

planemo upload for repository https://github.com/oinizan/FROGS-wrappers commit 0b900a51e220ce6f17c1e76292c06a5f4d934055-dirty
author frogs
date Thu, 25 Oct 2018 05:01:13 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:76c750c5f0d1
1 <?xml version="1.0"?>
2 <!--
3 # Copyright (C) 2015 INRA
4 #
5 # This program is free software: you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation, either version 3 of the License, or
8 # (at your option) any later version.
9 #
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
14 #
15 # You should have received a copy of the GNU General Public License
16 # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 -->
18 <tool id="FROGS_preprocess" name="FROGS Pre-process" version="2.0.1">
19 <description>Step 1 in metagenomics analysis: denoising and dereplication.</description>
<!-- Runtime dependency: the "frogs" package, pinned to 2.0.1 (the same version number as the tool element). -->
20 <requirements>
21 <requirement type="package" version="2.0.1">frogs</requirement>
22 </requirements>
<!-- Exit-code rules: the ranges "1:" and ":-1" together match every non-zero exit code.
     NOTE(review): no level attribute is given, so Galaxy's default severity applies; confirm it is the intended one. -->
23 <stdio>
24 <exit_code range="1:" />
25 <exit_code range=":-1" />
26 </stdio>
<!--
  Cheetah template that builds the preprocess.py command line.
   * First argument: the selected sequencer ("illumina" or "454").
   * Amplicon size bounds are always passed. Primers are passed for 454 and for the
     Illumina "standard" protocol; otherwise the without-primers flag is emitted.
   * Inputs come either from one archive or from the per-sample "samples" repeat
     (sample names, R1 files and, for paired Illumina data, R2 files plus merge settings).
  Dataset paths and free-text primer values are single-quoted so that the shell always
  receives each of them as exactly one argument.
-->
27 <command>
28 preprocess.py $sequencer_type.sequencer_selected
29 --output-dereplicated '$dereplicated_file' --output-count '$count_file' --summary '$summary_file'
30 --nb-cpus $nb_cpus
31 #if $sequencer_type.sequencer_selected == "illumina"
32 --min-amplicon-size $sequencer_type.min_amplicon_size --max-amplicon-size $sequencer_type.max_amplicon_size
33 #if $sequencer_type.sequencing_protocol.sequencing_protocol_selected == "standard"
34 --five-prim-primer '$sequencer_type.sequencing_protocol.five_prim_primer' --three-prim-primer '$sequencer_type.sequencing_protocol.three_prim_primer'
35 #else
36 --without-primers
37 #end if
38 #else
39 --min-amplicon-size $sequencer_type.min_amplicon_size --max-amplicon-size $sequencer_type.max_amplicon_size
40 --five-prim-primer '$sequencer_type.five_prim_primer' --three-prim-primer '$sequencer_type.three_prim_primer'
41 #end if
42
43 #if $sequencer_type.input_type.input_type_selected == "archive"
44 --input-archive '$sequencer_type.input_type.archive_file'
45 #if $sequencer_type.sequencer_selected == "illumina" and $sequencer_type.input_type.archive_type.archive_type_selected == "already_contiged"
46 --already-contiged
47 #elif $sequencer_type.sequencer_selected == "illumina"
48 --R1-size $sequencer_type.input_type.archive_type.R1_size --R2-size $sequencer_type.input_type.archive_type.R2_size
49 --expected-amplicon-size $sequencer_type.input_type.archive_type.expected_amplicon_size
50 --mismatch-rate $sequencer_type.input_type.archive_type.mm_rate
51 #end if
52 #else
53 #set $sep = ' '
54 #if $sequencer_type.sequencer_selected == "illumina"
55 --samples-names
56 #for $current in $sequencer_type.input_type.files_by_samples_type.samples
57 $sep"${current.name.strip()}"
58 #end for
59 --input-R1
60 #for $current in $sequencer_type.input_type.files_by_samples_type.samples
61 $sep'${current.R1_file}'
62 #end for
63 #if $sequencer_type.input_type.files_by_samples_type.files_by_samples_type_selected == "already_contiged"
64 --already-contiged
65 #else
66 --input-R2
67 #for $current in $sequencer_type.input_type.files_by_samples_type.samples
68 $sep'${current.R2_file}'
69 #end for
70 --R1-size $sequencer_type.input_type.files_by_samples_type.R1_size --R2-size $sequencer_type.input_type.files_by_samples_type.R2_size
71 --expected-amplicon-size $sequencer_type.input_type.files_by_samples_type.expected_amplicon_size
72 --mismatch-rate $sequencer_type.input_type.files_by_samples_type.mm_rate
73 #end if
74 #else
75 --input-R1
76 #for $current in $sequencer_type.input_type.samples
77 $sep'${current.R1_file}'
78 #end for
79 --samples-names
80 #for $current in $sequencer_type.input_type.samples
81 $sep"${current.name.strip()}"
82 #end for
83 #end if
84 #end if
85 </command>
86 <inputs>
<!-- Hidden parameter with a fixed value of 1; the command template forwards it to the nb-cpus option of preprocess.py. -->
87 <param name="nb_cpus" type="hidden" label="CPU number" help="The maximum number of CPUs used." value="1" />
88 <conditional name="sequencer_type">
<!-- Selector of the enclosing conditional: its value picks one of the two <when> branches that follow
     and is also passed as the first argument of preprocess.py by the command template. -->
89 <param name="sequencer_selected" type="select" label="Sequencer" help="Select the sequencing technology used to produce the sequences.">
90 <option value="illumina" selected="true">Illumina</option>
91 <option value="454">454</option>
92 </param>
93 <when value="illumina">
94 <!-- Sample inputs: either one archive or per-sample FASTQ datasets -->
95 <conditional name="input_type">
96 <param name="input_type_selected" type="select" label="Input type" help="Samples files can be provided in single archive or with two files (R1 and R2) by sample.">
97 <option value="files_by_samples" selected="true">Files by samples</option>
98 <option value="archive">Archive</option>
99 </param>
100 <when value="archive">
101 <param name="archive_file" type="data" format="tar" label="Archive file" help="The tar file containing the sequences file(s) for each sample." optional="false" />
102 <conditional name="archive_type">
103 <param name="archive_type_selected" type="select" label="Reads already contiged ?" help="The archive contains 1 file by sample : R1 and R2 are already merged by pair.">
104 <option value="paired" selected="true">No</option>
105 <option value="already_contiged">Yes</option>
106 </param>
107 <when value="paired">
108 <!-- Read lengths and merge settings, only asked for un-merged pairs -->
109 <param name="R1_size" type="integer" label="Reads 1 size" help="The read1 size." value="" optional="false" />
110 <param name="R2_size" type="integer" label="Reads 2 size" help="The read2 size." value="" optional="false" />
111 <param name="expected_amplicon_size" type="integer" label="Expected amplicon size" help="Maximum amplicon length expected in approximately 90% of the amplicons." value="" />
112 <param name="mm_rate" type="float" label="mismatch rate." help="The maximum rate of mismatch in the overlap region" value="0.1" optional="false" />
113 </when>
114 <when value="already_contiged" />
115 </conditional>
116 </when>
117 <when value="files_by_samples">
118 <conditional name="files_by_samples_type">
119 <param name="files_by_samples_type_selected" type="select" label="Reads already contiged ?" help="The inputs contain 1 file by sample : R1 and R2 are already merged by pair.">
120 <option value="paired" selected="true">No</option>
121 <option value="already_contiged">Yes</option>
122 </param>
123 <when value="paired">
124 <!-- One entry per sample: name plus R1 and R2 datasets -->
125 <repeat name="samples" title="Samples" min="1">
126 <param name="name" type="text" label="Name" help="The sample name." optional="false">
127 <validator type="empty_field" message="This parameter is required." />
128 </param>
129 <param name="R1_file" type="data" format="fastq" label="Reads 1" help="R1 FASTQ file of paired-end reads." />
130 <param name="R2_file" type="data" format="fastq" label="reads 2" help="R2 FASTQ file of paired-end reads." />
131 </repeat>
132 <!-- Read lengths and merge settings, only asked for un-merged pairs -->
133 <param name="R1_size" type="integer" label="Reads 1 size" help="The read1 size." value="" optional="false" />
134 <param name="R2_size" type="integer" label="Reads 2 size" help="The read2 size." value="" optional="false" />
135 <param name="expected_amplicon_size" type="integer" label="Expected amplicon size" help="Maximum amplicon length expected in approximately 90% of the amplicons." value="" />
136 <param name="mm_rate" type="float" label="mismatch rate." help="The maximum rate of mismatches in the overlap region" value="0.1" optional="false" />
137 </when>
138 <when value="already_contiged">
139 <repeat name="samples" title="Samples" min="1">
140 <param name="name" type="text" label="Name" help="The sample name." optional="false">
141 <validator type="empty_field" message="This parameter is required." />
142 </param>
143 <param name="R1_file" type="data" format="fastq" label="Sequence file" help="FASTQ file of merged reads." />
144 </repeat>
145 </when>
146 </conditional>
147 </when>
148 </conditional>
149 <!-- Amplicon length bounds, always passed to preprocess.py -->
150 <param name="min_amplicon_size" type="integer" label="Minimum amplicon size" help="The minimum size for the amplicons." value="" optional="false" />
151 <param name="max_amplicon_size" type="integer" label="Maximum amplicon size" help="The maximum size for the amplicons." value="" optional="false" />
152 <!-- Primers: requested only for the standard protocol -->
153 <conditional name="sequencing_protocol">
154 <param name="sequencing_protocol_selected" type="select" label="Sequencing protocol" help="The protocol used for sequencing step: standard or custom with PCR primers as sequencing primers.">
155 <option value="standard" selected="true">Illumina standard</option>
156 <option value="without_primers">Custom protocol (Kozich et al. 2013)</option>
157 </param>
158 <when value="standard">
159 <param name="five_prim_primer" type="text" size="20" label="5' primer" help="The 5' primer sequence (wildcards are accepted). The orientation is detailed below in 'Primers parameters'." optional="false">
160 <validator type="empty_field" message="This parameter is required." />
161 </param>
162 <param name="three_prim_primer" type="text" size="20" label="3' primer" help="The 3' primer sequence (wildcards are accepted). The orientation is detailed below in 'Primers parameters'." optional="false">
163 <validator type="empty_field" message="This parameter is required." />
164 </param>
165 </when>
166 <when value="without_primers" />
167 </conditional>
168 </when>
169
170 <when value="454">
171 <!-- Sample inputs: either one archive or one FASTQ dataset per sample -->
172 <conditional name="input_type">
173 <param name="input_type_selected" type="select" label="Input type" help="Samples files can be provided in single archive or with one file by sample.">
174 <option value="files_by_samples" selected="true">One file by sample</option>
175 <option value="archive">Archive</option>
176 </param>
177 <when value="archive">
178 <param name="archive_file" type="data" format="tar" label="Archive file" help="The tar file containing the sequences file for each sample." optional="false" />
179 </when>
180 <when value="files_by_samples">
181 <repeat name="samples" title="Samples" min="1">
<!-- Same empty_field validator as the Illumina sample names, so that an empty name is rejected in the form
     instead of being passed on to the samples-names option. -->
182 <param name="name" type="text" label="Name" help="The sample name." optional="false"><validator type="empty_field" message="This parameter is required." /></param>
183 <param format="fastq" name="R1_file" type="data" label="Sequence file" help="FASTQ file of sample." />
184 </repeat>
185 </when>
186 </conditional>
187 <!-- Amplicon length bounds (primers included, per the help strings) -->
188 <param name="min_amplicon_size" type="integer" label="Minimum amplicon size" help="The minimum size for the amplicons (with primers)." value="" optional="false" />
189 <param name="max_amplicon_size" type="integer" label="Maximum amplicon size" help="The maximum size for the amplicons (with primers)." value="" optional="false" />
190 <!-- Primers: always required for 454; the command template passes both unconditionally -->
191 <param name="five_prim_primer" type="text" size="20" label="5' primer" help="The 5' primer sequence (wildcards are accepted). The orientation is detailed below in 'Primers parameters'." optional="false">
192 <validator type="empty_field" message="This parameter is required." />
193 </param>
194 <param name="three_prim_primer" type="text" size="20" label="3' primer" help="The 3' primer sequence (wildcards are accepted). The orientation is detailed below in 'Primers parameters'." optional="false">
195 <validator type="empty_field" message="This parameter is required." />
196 </param>
197 </when>
198 </conditional>
199 </inputs>
<!-- Three output datasets: dereplicated sequences (fasta), per-sample counts (tabular) and an HTML report.
     The command template also passes each dataset path explicitly to preprocess.py
     (output-dereplicated, output-count and summary options).
     NOTE(review): from_work_dir is declared in addition to those explicit paths; confirm which of the
     two mechanisms actually populates the datasets. -->
200 <outputs>
201 <data format="fasta" name="dereplicated_file" label="${tool.name}: dereplicated.fasta" from_work_dir="dereplicated.fasta" />
202 <data format="tabular" name="count_file" label="${tool.name}: count.tsv" from_work_dir="count.tsv" />
203 <data format="html" name="summary_file" label="${tool.name}: report.html" from_work_dir="report.html" />
204 </outputs>
<!-- Single functional test: Illumina, archive input with un-merged (paired) reads, standard protocol with both primers.
     The fasta and tsv outputs are compared with reference files; the HTML report is compared by size only (sim_size, delta 0). -->
205 <tests>
206 <test>
207 <conditional name="sequencer_type">
208 <param name="sequencer_selected" value="illumina" />
209 <conditional name="input_type">
210 <param name="input_type_selected" value="archive" />
211 <param name="archive_file" value="test_dataset.tar.gz" />
212 <conditional name="archive_type">
213 <param name="archive_type_selected" value="paired" />
214 <param name="R1_size" value="250" />
215 <param name="R2_size" value="250" />
216 <param name="expected_amplicon_size" value="420" />
217 <param name="mm_rate" value="0.15" />
218 </conditional>
219 </conditional>
220 <param name="min_amplicon_size" value="380" />
221 <param name="max_amplicon_size" value="460" />
222 <conditional name="sequencing_protocol">
223 <param name="sequencing_protocol_selected" value="standard" />
224 <param name="five_prim_primer" value="GGCGVACGGGTGAGTAA" />
225 <param name="three_prim_primer" value="GTGCCAGCNGCNGCGG" />
226 </conditional>
227 </conditional>
228 <output name="dereplicated_file" file="references/01-prepro.fasta" />
229 <output name="count_file" file="references/01-prepro.tsv" />
230 <output name="summary_file" file="references/01-prepro.html" compare="sim_size" delta="0" />
231 </test>
232 </tests>
233 <help>
234
235 .. image:: static/images/FROGS_logo.png
236 :height: 144
237 :width: 110
238
239
240 .. class:: infomark page-header h2
241
242 What it does
243
244 FROGS Pre-process filters and dereplicates amplicons for use in diversity analysis.
245
246
247 .. class:: infomark page-header h2
248
249 Inputs/Outputs
250
251 .. class:: h3
252
253 Inputs
254
255 Sample files are added one after another or provided in an archive file (tar.gz).
256
257 .. container:: row
258
259 .. container:: col-md-6
260
261 **Illumina inputs**
262
263 :Usage: For samples sequenced in paired-end. The amplicon length must be shorter than the combined length of R1 and R2. R1 and R2 are merged on their common (overlapping) region.
264 :Files: One R1 and R2 by sample (format `FASTQ &lt;https://en.wikipedia.org/wiki/FASTQ_format&gt;`_)
265 :Example: splA_R1.fastq.gz, splA_R2.fastq.gz, splB_R1.fastq.gz, splB_R2.fastq.gz
266
267 OR
268
269 :Usage: For samples sequenced in single-end or when R1 and R2 reads are already merged.
270 :Files: One sequence file by sample (format `FASTQ &lt;https://en.wikipedia.org/wiki/FASTQ_format&gt;`_).
271 :Example: splA.fastq.gz, splB.fastq.gz
272
273 .. container:: col-md-6
274
275 **454 inputs**
276
277 :Files: One sequence file by sample (format `FASTQ &lt;https://en.wikipedia.org/wiki/FASTQ_format&gt;`_)
278 :Example: splA.fastq.gz, splB.fastq.gz
279
280 Remark: In an archive, if you use R1 and R2 files, their names must end with *_R1* and *_R2*. To upload an archive, see the "Upload archive" tool or, if possible, create a symbolic link on your Galaxy account.
281
282 .. class:: h3
283
284 Outputs
285
286 **Sequence file** (dereplicated.fasta):
287
288 Only one file with all samples sequences (format `FASTA &lt;https://en.wikipedia.org/wiki/FASTA_format&gt;`_). These sequences are dereplicated: strictly identical sequences are represented only once and the initial count is kept in the count file.
289
290 **Count file** (count.tsv):
291
292 This file contains the count of all unique sequences in each sample (format `TSV &lt;https://en.wikipedia.org/wiki/Tab-separated_values&gt;`_).
293
294 **Summary file** (report.html):
295
296 This file reports the number of remaining sequences after each filter (format `HTML &lt;https://en.wikipedia.org/wiki/HTML&gt;`_).
297
298 .. image:: static/images/FROGS_preprocess_summary.png
299 :height: 355
300 :width: 676
301
302 It also presents the length distribution of the remaining sequences.
303
304 .. image:: static/images/FROGS_preprocess_lengthsSamples.png
305 :height: 350
306 :width: 676
307
308 .. class:: infomark page-header h2
309
310 How it works
311
312 .. csv-table::
313 :header: "Steps", "Illumina", "454"
314 :widths: 5, 150, 150
315 :class: table table-striped
316
317 "1", "For un-merged data: merges R1 and R2 with a maximum of M% mismatch in the overlapped region (`FLASh &lt;http://ccb.jhu.edu/software/FLASH/&gt;`_). By default M is set to 10%", "/"
318 "2", "Filters merged sequences on their length which must be range between 'Minimum amplicon size' and 'Maximum amplicon size'", "/"
319 "3", "If sequencing protocol is the illumina standard protocol : Removes sequences where the two primers are not present and then remove primers in the remaining sequence (`cutadapt &lt;http://cutadapt.readthedocs.org/en/latest/guide.html&gt;`_). The primer search accepts 10% of differences", "Removes sequences where the two primers are not present, removes primers sequence and reverse complement the sequences on strand - (`cutadapt &lt;http://cutadapt.readthedocs.org/en/latest/guide.html&gt;`_). The primer search accepts 10% of differences"
320 "4", "Filters sequences on their length and with ambiguous nucleotides", "Removes sequences with at least one homopolymer with more than seven nucleotides and with a distance of less than or equal to 10 nucleotides between two poor quality positions, i.e. with a Phred quality score lower than 10"
321 "5", "Dereplicates sequences", "Dereplicates sequences"
322
323
324 .. class:: infomark page-header h2
325
326 Advices/details on parameters
327
328 .. class:: h3
329
330 Primers parameters
331
332 The (`Kozich et al. 2013 &lt;http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3753973/&gt;`_ ) protocol uses custom sequencing primers which are also the PCR primers. In this case the reads do not contain the PCR primers.
333
334 In case of Illumina standard protocol, the primers must be provided in 5' to 3' orientation.
335
336 .. role:: alert-info
337
338 Example:
339
340 5' :alert-info:`ATGCCC` GTCGTCGTAAAATGC :alert-info:`ATTTCAG` 3'
341
342 Value for parameter 5' primer: ATGCCC
343
344 Value for parameter 3' primer: ATTTCAG
345
346 .. class:: h3
347
348 Amplicons sizes parameters
349
350 The two following images show two examples of perfect values for the size parameters.
351
352 .. image:: static/images/FROGS_preprocess_ampliconSize_unimodal.png
353 :height: 415
354 :width: 676
355
356 .. image:: static/images/FROGS_preprocess_ampliconSize_multimodal.png
357 :height: 415
358 :width: 676
359
360 Don't worry: the "Expected amplicon size" does not need to be very accurate.
361
362 .. class:: h3
363
364 If the filter 'overlapped' drastically reduces the number of sequences:
365
366 In un-merged Illumina data, the reduction of the dataset by the overlapped filter is classically less than 20%. A loss of more than 20% in all samples can highlight a quality problem.
367
368 If the overlap between R1 and R2 is superior to 50 nucleotides and the quality of the end of the sequences is poor (see `FastQC &lt;http://www.bioinformatics.babraham.ac.uk/projects/fastqc/&gt;`_) you can try to cut the end of your sequences and relaunch the preprocess tool.
369 You can also raise the mismatch percent in the overlapped region, but not too much!
370
371 ----
372
373 **Contact**
374
375 Contacts: frogs@inra.fr
376
377 Repository: https://github.com/geraldinepascal/FROGS
378
379 Please cite the FROGS Publication: *Frederic Escudie, Lucas Auer, Maria Bernard, Mahendra Mariadassou, Laurent Cauquil, Katia Vidal, Sarah Maman, Guillermina Hernandez-Raquet, Sylvie Combes, Geraldine Pascal; FROGS: Find, Rapidly, OTUs with Galaxy Solution, Bioinformatics, , btx791,* https://doi.org/10.1093/bioinformatics/btx791
380
381 Depending on the help provided you can cite us in acknowledgements, references or both.
382 </help>
383 <citations>
<!-- FROGS publication (same DOI as quoted in the Contact part of the help) -->
384 <citation type="doi">10.1093/bioinformatics/btx791</citation>
<!-- presumably Kozich et al. 2013, the custom sequencing protocol mentioned in the help; verify -->
385 <citation type="doi">10.1128/AEM.01043-13</citation>
<!-- presumably cutadapt, used for primer removal according to the help; verify -->
386 <citation type="doi">10.14806/ej.17.1.200</citation>
<!-- presumably FLASh, used for read merging according to the help; verify -->
387 <citation type="doi">10.1093/bioinformatics/btr507</citation>
388 </citations>
389 </tool>