view preprocess.xml @ 0:76c750c5f0d1 draft default tip

planemo upload for repository https://github.com/oinizan/FROGS-wrappers commit 0b900a51e220ce6f17c1e76292c06a5f4d934055-dirty
author frogs
date Thu, 25 Oct 2018 05:01:13 -0400
parents
children
line wrap: on
line source

<?xml version="1.0"?>
<!--
# Copyright (C) 2015 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
-->
<tool id="FROGS_preprocess" name="FROGS Pre-process" version="2.0.1">
        <!-- One-line summary displayed next to the tool name. -->
        <description>Step 1 in metagenomics analysis: denoising and dereplication.</description>
        <!-- Package dependency that Galaxy must resolve before running the command. -->
        <requirements>
                <requirement version="2.0.1" type="package">frogs</requirement>
        </requirements>
        <!-- Any non-zero exit code, negative or positive, is treated as a fatal job error. -->
        <stdio>
                <exit_code range=":-1" level="fatal" />
                <exit_code range="1:" level="fatal" />
        </stdio>
	<command><![CDATA[
		## Builds the preprocess.py command line from the selected sequencer,
		## the input layout (archive or one entry per sample) and the primer options.
		## Dataset paths and free-text values (primers, sample names) are wrapped in
		## single quotes so the shell passes each one as exactly one literal argument.
		preprocess.py $sequencer_type.sequencer_selected
		--output-dereplicated '$dereplicated_file' --output-count '$count_file' --summary '$summary_file'
		--nb-cpus $nb_cpus

		## The amplicon size bounds exist for both sequencers, so they are emitted once.
		--min-amplicon-size $sequencer_type.min_amplicon_size --max-amplicon-size $sequencer_type.max_amplicon_size

		## Primers: Illumina may run without primers (custom protocol); 454 always provides them.
		#if $sequencer_type.sequencer_selected == "illumina"
			#if $sequencer_type.sequencing_protocol.sequencing_protocol_selected == "standard"
				--five-prim-primer '$sequencer_type.sequencing_protocol.five_prim_primer' --three-prim-primer '$sequencer_type.sequencing_protocol.three_prim_primer'
			#else
				--without-primers
			#end if
		#else
			--five-prim-primer '$sequencer_type.five_prim_primer' --three-prim-primer '$sequencer_type.three_prim_primer'
		#end if

		#if $sequencer_type.input_type.input_type_selected == "archive"
			## Single archive holding the files of every sample.
			--input-archive '$sequencer_type.input_type.archive_file'
			## Only the Illumina form has the archive_type conditional; 454 adds nothing here.
			#if $sequencer_type.sequencer_selected == "illumina"
				#if $sequencer_type.input_type.archive_type.archive_type_selected == "already_contiged"
					--already-contiged
				#else
					--R1-size $sequencer_type.input_type.archive_type.R1_size --R2-size $sequencer_type.input_type.archive_type.R2_size
					--expected-amplicon-size $sequencer_type.input_type.archive_type.expected_amplicon_size
					--mismatch-rate $sequencer_type.input_type.archive_type.mm_rate
				#end if
			#end if
		#else
			## One repeat entry per sample; values are emitted space-separated after each option.
			#set $sep = ' '
			#if $sequencer_type.sequencer_selected == "illumina"
				--samples-names
				#for $current in $sequencer_type.input_type.files_by_samples_type.samples
					$sep'${current.name.strip()}'
				#end for
				--input-R1
				#for $current in $sequencer_type.input_type.files_by_samples_type.samples
					$sep'${current.R1_file}'
				#end for
				#if $sequencer_type.input_type.files_by_samples_type.files_by_samples_type_selected == "already_contiged"
					--already-contiged
				#else
					--input-R2
					#for $current in $sequencer_type.input_type.files_by_samples_type.samples
						$sep'${current.R2_file}'
					#end for
					--R1-size $sequencer_type.input_type.files_by_samples_type.R1_size --R2-size $sequencer_type.input_type.files_by_samples_type.R2_size
					--expected-amplicon-size $sequencer_type.input_type.files_by_samples_type.expected_amplicon_size
					--mismatch-rate $sequencer_type.input_type.files_by_samples_type.mm_rate
				#end if
			#else
				--input-R1
				#for $current in $sequencer_type.input_type.samples
					$sep'${current.R1_file}'
				#end for
				--samples-names
				#for $current in $sequencer_type.input_type.samples
					$sep'${current.name.strip()}'
				#end for
			#end if
		#end if
	]]></command>
	<inputs>
		<!-- Hidden value forwarded to the nb-cpus option of the command; fixed to 1 in this wrapper. -->
		<param name="nb_cpus" type="hidden" label="CPU number" help="The maximum number of CPUs used." value="1" />
		<!-- Top-level switch: the sequencing technology selects which input, amplicon and primer fields are shown. -->
		<conditional name="sequencer_type">
			<param name="sequencer_selected" type="select" label="Sequencer" help="Select the sequencing technology used to produce the sequences.">
				<option value="illumina" selected="true">Illumina</option>
				<option value="454">454</option>
			</param>
			<when value="illumina">
				<!-- Samples: either one archive or one repeat entry per sample -->
				<conditional name="input_type">
					<param name="input_type_selected" type="select" label="Input type" help="Samples files can be provided in single archive or with two files (R1 and R2) by sample.">
						<option value="files_by_samples" selected="true">Files by samples</option>
						<option value="archive">Archive</option>
					</param>
					<when value="archive">
						<param name="archive_file" type="data" format="tar" label="Archive file" help="The tar file containing the sequences file(s) for each sample." optional="false" />
						<conditional name="archive_type">
							<param name="archive_type_selected" type="select" label="Reads already contiged ?" help="The archive contains 1 file by sample : R1 and R2 are already merged by pair.">
								<option value="paired" selected="true">No</option>
								<option value="already_contiged">Yes</option>
							</param>
							<when value="paired">
								<!-- Read sizes and merge settings, only needed when R1 and R2 still have to be merged -->
								<param name="R1_size" type="integer" label="Reads 1 size" help="The read1 size." value="" optional="false" />
								<param name="R2_size" type="integer" label="Reads 2 size" help="The read2 size." value="" optional="false" />
								<param name="expected_amplicon_size" type="integer" label="Expected amplicon size" help="Maximum amplicon length expected in approximately 90% of the amplicons." value="" />
								<param name="mm_rate" type="float" label="mismatch rate." help="The maximum rate of mismatch in the overlap region" value="0.1" optional="false" />
							</when>
							<!-- Already merged reads need no additional settings -->
							<when value="already_contiged" />
						</conditional>
					</when>
					<when value="files_by_samples">
						<conditional name="files_by_samples_type">
							<param name="files_by_samples_type_selected" type="select" label="Reads already contiged ?" help="The inputs contain 1 file by sample : R1 and R2 are already merged by pair.">
								<option value="paired" selected="true">No</option>
								<option value="already_contiged">Yes</option>
							</param>
							<when value="paired">
								<!-- Samples: a name plus the R1 and R2 files, at least one entry -->
								<repeat name="samples" title="Samples" min="1">
									<param name="name" type="text" label="Name" help="The sample name." optional="false">
										<validator type="empty_field" message="This parameter is required." />
									</param>
									<param format="fastq" name="R1_file" type="data" label="Reads 1" help="R1 FASTQ file of paired-end reads." />
									<param format="fastq" name="R2_file" type="data" label="reads 2" help="R2 FASTQ file of paired-end reads." />
								</repeat>
								<!-- Read sizes and merge settings -->
								<param name="R1_size" type="integer" label="Reads 1 size" help="The read1 size." value="" optional="false" />
								<param name="R2_size" type="integer" label="Reads 2 size" help="The read2 size." value="" optional="false" />
								<param name="expected_amplicon_size" type="integer" label="Expected amplicon size" help="Maximum amplicon length expected in approximately 90% of the amplicons." value="" />
								<param name="mm_rate" type="float" label="mismatch rate." help="The maximum rate of mismatches in the overlap region" value="0.1" optional="false" />
							</when>
							<when value="already_contiged">
								<!-- Samples: a name plus one merged-reads file, at least one entry -->
								<repeat name="samples" title="Samples" min="1">
									<param name="name" type="text" label="Name" help="The sample name." optional="false">
										<validator type="empty_field" message="This parameter is required." />
									</param>
									<param format="fastq" name="R1_file" type="data" label="Sequence file" help="FASTQ file of merged reads." />
								</repeat>
							</when>
						</conditional>
					</when>
				</conditional>
				<!-- Amplicons -->
				<param name="min_amplicon_size" type="integer" label="Minimum amplicon size" help="The minimum size for the amplicons." value="" optional="false" />
				<param name="max_amplicon_size" type="integer" label="Maximum amplicon size" help="The maximum size for the amplicons." value="" optional="false" />
				<!-- Primers: only requested for the standard protocol -->
				<conditional name="sequencing_protocol">
					<param name="sequencing_protocol_selected" type="select" label="Sequencing protocol" help="The protocol used for sequencing step: standard or custom with PCR primers as sequencing primers.">
						<option value="standard" selected="true">Illumina standard</option>
						<option value="without_primers">Custom protocol (Kozich et al. 2013)</option>
					</param>
					<when value="standard">
						<param name="five_prim_primer" type="text" size="20" label="5' primer" help="The 5' primer sequence (wildcards are accepted). The orientation is detailed below in 'Primers parameters'." optional="false">
							<validator type="empty_field" message="This parameter is required." />
						</param>
						<param name="three_prim_primer" type="text" size="20" label="3' primer" help="The 3' primer sequence (wildcards are accepted). The orientation is detailed below in 'Primers parameters'." optional="false">
							<validator type="empty_field" message="This parameter is required." />
						</param>
					</when>
					<when value="without_primers" />
				</conditional>
			</when>

			<when value="454">
				<!-- Samples: either one archive or one repeat entry per sample -->
				<conditional name="input_type">
					<param name="input_type_selected" type="select" label="Input type" help="Samples files can be provided in single archive or with one file by sample.">
						<option value="files_by_samples" selected="true">One file by sample</option>
						<option value="archive">Archive</option>
					</param>
					<when value="archive">
						<param name="archive_file" type="data" format="tar" label="Archive file" help="The tar file containing the sequences file for each sample." optional="false" />
					</when>
					<when value="files_by_samples">
						<repeat name="samples" title="Samples" min="1">
							<!-- Same non-empty check as the Illumina sample names, so an empty name never reaches the command line -->
							<param name="name" type="text" label="Name" help="The sample name." optional="false">
								<validator type="empty_field" message="This parameter is required." />
							</param>
							<param format="fastq" name="R1_file" type="data" label="Sequence file" help="FASTQ file of sample." />
						</repeat>
					</when>
				</conditional>
				<!-- Amplicons -->
				<param name="min_amplicon_size" type="integer" label="Minimum amplicon size" help="The minimum size for the amplicons (with primers)." value="" optional="false" />
				<param name="max_amplicon_size" type="integer" label="Maximum amplicon size" help="The maximum size for the amplicons (with primers)." value="" optional="false" />
				<!-- Primers: always required for 454 -->
				<param name="five_prim_primer" type="text" size="20" label="5' primer" help="The 5' primer sequence (wildcards are accepted). The orientation is detailed below in 'Primers parameters'." optional="false">
					<validator type="empty_field" message="This parameter is required." />
				</param>
				<param name="three_prim_primer" type="text" size="20" label="3' primer" help="The 3' primer sequence (wildcards are accepted). The orientation is detailed below in 'Primers parameters'." optional="false">
					<validator type="empty_field" message="This parameter is required." />
				</param>
			</when>
		</conditional>
	</inputs>
	<outputs>
		<!-- The three datasets produced by a run: dereplicated sequences, per-sample counts and the HTML report. -->
		<data name="dereplicated_file" format="fasta" from_work_dir="dereplicated.fasta" label="${tool.name}: dereplicated.fasta" />
		<data name="count_file" format="tabular" from_work_dir="count.tsv" label="${tool.name}: count.tsv" />
		<data name="summary_file" format="html" from_work_dir="report.html" label="${tool.name}: report.html" />
	</outputs>
        <tests>
                <!-- Illumina, paired reads supplied as an archive, standard protocol with both primers. -->
                <test>
                        <conditional name="sequencer_type">
                                <param name="sequencer_selected" value="illumina" />
                                <conditional name="input_type">
                                        <param name="input_type_selected" value="archive" />
                                        <param name="archive_file" value="test_dataset.tar.gz" />
                                        <conditional name="archive_type">
                                                <param name="archive_type_selected" value="paired" />
                                                <param name="R1_size" value="250" />
                                                <param name="R2_size" value="250" />
                                                <param name="expected_amplicon_size" value="420" />
                                                <param name="mm_rate" value="0.15" />
                                        </conditional>
                                </conditional>
                                <param name="min_amplicon_size" value="380" />
                                <param name="max_amplicon_size" value="460" />
                                <conditional name="sequencing_protocol">
                                        <param name="sequencing_protocol_selected" value="standard" />
                                        <param name="five_prim_primer" value="GGCGVACGGGTGAGTAA" />
                                        <param name="three_prim_primer" value="GTGCCAGCNGCNGCGG" />
                                </conditional>
                        </conditional>
                        <output name="dereplicated_file" file="references/01-prepro.fasta" />
                        <output name="count_file" file="references/01-prepro.tsv" />
                        <!-- The HTML report is compared by file size only, with zero tolerance. -->
                        <output name="summary_file" file="references/01-prepro.html" compare="sim_size" delta="0" />
                </test>
        </tests>
	<help>

.. image:: static/images/FROGS_logo.png
   :height: 144
   :width: 110


.. class:: infomark page-header h2

What it does

FROGS Pre-process filters and dereplicates amplicons for use in diversity analysis.


.. class:: infomark page-header h2

Inputs/Outputs

.. class:: h3

Inputs

Sample files are added one after another or provided in an archive file (tar.gz).

.. container:: row

 .. container:: col-md-6

  **Illumina inputs**

   :Usage: For samples sequenced in paired-end mode. The amplicon length must be shorter than the combined length of R1 and R2. R1 and R2 are merged on their common (overlapping) region.
   :Files: One R1 file and one R2 file per sample (format `FASTQ &lt;https://en.wikipedia.org/wiki/FASTQ_format&gt;`_)
   :Example: splA_R1.fastq.gz,  splA_R2.fastq.gz,  splB_R1.fastq.gz,  splB_R2.fastq.gz

  OR

   :Usage: For samples sequenced in single-end mode or when R1 and R2 reads are already merged.
   :Files: One sequence file per sample (format `FASTQ &lt;https://en.wikipedia.org/wiki/FASTQ_format&gt;`_).
   :Example: splA.fastq.gz,  splB.fastq.gz

 .. container:: col-md-6

  **454 inputs**

   :Files: One sequence file per sample (format `FASTQ &lt;https://en.wikipedia.org/wiki/FASTQ_format&gt;`_)
   :Example: splA.fastq.gz,  splB.fastq.gz

Remark: If an archive contains R1 and R2 files, their names must end with *_R1* and *_R2*. To upload an archive, see the "Upload archive" tool or, if possible, create a symbolic link on your Galaxy account.

.. class:: h3

Outputs

**Sequence file** (dereplicated.fasta):

 Only one file with the sequences of all samples (format `FASTA &lt;https://en.wikipedia.org/wiki/FASTA_format&gt;`_). These sequences are dereplicated: strictly identical sequences are represented only once and their initial count is kept in the count file.

**Count file** (count.tsv):

 This file contains the count of all unique sequences in each sample (format `TSV &lt;https://en.wikipedia.org/wiki/Tab-separated_values&gt;`_).

**Summary file** (report.html):

 This file reports the number of remaining sequences after each filter (format `HTML &lt;https://en.wikipedia.org/wiki/HTML&gt;`_).
 
 .. image:: static/images/FROGS_preprocess_summary.png 
     :height: 355
     :width: 676 

 It also presents the length distribution of the remaining sequences.
 
 .. image:: static/images/FROGS_preprocess_lengthsSamples.png 
     :height: 350
     :width: 676 

.. class:: infomark page-header h2

How it works

.. csv-table:: 
   :header: "Steps", "Illumina", "454"
   :widths: 5, 150, 150
   :class: table table-striped

   "1", "For un-merged data: merges R1 and R2 with a maximum of M% mismatch in the overlaped region (`FLASh &lt;http://ccb.jhu.edu/software/FLASH/&gt;`_). By default M is set to 10%", "/"
   "2", "Filters merged sequences on their length which must be range between 'Minimum amplicon size' and 'Maximum amplicon size'", "/"
   "3", "If sequencing protocol is the illumina standard protocol : Removes sequences where the two primers are not present and then remove primers in the remaining sequence (`cutadapt &lt;http://cutadapt.readthedocs.org/en/latest/guide.html&gt;`_). The primer search accepts 10% of differences", "Removes sequences where the two primers are not present, removes primers sequence and reverse complement the sequences on strand -  (`cutadapt &lt;http://cutadapt.readthedocs.org/en/latest/guide.html&gt;`_). The primer search accepts 10% of differences"
   "4", "Filters sequences on their length and with ambiguous nucleotides", "the tool removes sequences with at least one homopolymer with more than seven nucleotides and with a distance of less than or equal to 10 nucleo-tides between two poor quality positions, i.e. with a Phred quality score lesser than 10" 
   "5", "Dereplicates sequences", "Dereplicates sequences"


.. class:: infomark page-header h2

Advice/details on parameters

.. class:: h3

Primers parameters

The (`Kozich et al. 2013 &lt;http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3753973/&gt;`_ ) protocol uses custom sequencing primers which are also the PCR primers. In this case the reads do not contain the PCR primers.

In case of Illumina standard protocol, the primers must be provided in 5' to 3' orientation. 

.. role:: alert-info

Example:

 5' :alert-info:`ATGCCC` GTCGTCGTAAAATGC :alert-info:`ATTTCAG` 3'
 
 Value for parameter 5' primer: ATGCCC
 
 Value for parameter 3' primer: ATTTCAG

.. class:: h3

Amplicons sizes parameters

 The two following images show two examples of perfect values for the size parameters.

 .. image:: static/images/FROGS_preprocess_ampliconSize_unimodal.png
    :height: 415
    :width: 676
 
 .. image:: static/images/FROGS_preprocess_ampliconSize_multimodal.png
    :height: 415
    :width: 676

 Don't worry: the "Expected amplicon size" does not need to be very accurate.

.. class:: h3

If the 'overlapped' filter drastically reduces the number of sequences:

 In un-merged Illumina data, the reduction of the dataset by the overlapped filter is typically less than 20%. A loss of more than 20% in all samples may indicate a quality problem.
 
 If the overlap between R1 and R2 is greater than 50 nucleotides and the quality of the end of the sequences is poor (see `FastQC &lt;http://www.bioinformatics.babraham.ac.uk/projects/fastqc/&gt;`_), you can try to trim the end of your sequences and relaunch the preprocess tool.
 You can also raise the mismatch rate allowed in the overlapped region, but not too much!

----

**Contact**

Contacts: frogs@inra.fr

Repository: https://github.com/geraldinepascal/FROGS

Please cite the FROGS Publication: *Frederic Escudie, Lucas Auer, Maria Bernard, Mahendra Mariadassou, Laurent Cauquil, Katia Vidal, Sarah Maman, Guillermina Hernandez-Raquet, Sylvie Combes, Geraldine Pascal; FROGS: Find, Rapidly, OTUs with Galaxy Solution, Bioinformatics, btx791,* https://doi.org/10.1093/bioinformatics/btx791

Depending on the help provided you can cite us in acknowledgements, references or both.
	</help> 
	<!-- Publications to cite when this tool is used, referenced by DOI. -->
	<citations>
		<!-- FROGS publication (same DOI as the one given at the end of the help text) -->
		<citation type="doi">10.1093/bioinformatics/btx791</citation>
		<!-- NOTE(review): presumably Kozich et al. 2013, the custom protocol named in the help; confirm the DOI -->
		<citation type="doi">10.1128/AEM.01043-13</citation>
		<!-- NOTE(review): presumably cutadapt, used for primer removal according to the help; confirm the DOI -->
		<citation type="doi">10.14806/ej.17.1.200</citation>
		<!-- NOTE(review): presumably FLASh, used for read merging according to the help; confirm the DOI -->
		<citation type="doi">10.1093/bioinformatics/btr507</citation>
	</citations>
</tool>