view fastp.xml @ 0:988729b728f0 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fastp commit 2e41f5d3dd8201a5f8295ec30095c508250a8304
author iuc
date Fri, 09 Mar 2018 20:21:51 -0500
parents
children f44e93b4529c
line wrap: on
line source

<tool id="fastp" name="fastp" version="@WRAPPER_VERSION@.0">
    <!-- Short summary displayed alongside the tool name. -->
    <description>- fast all-in-one preprocessing for FASTQ files</description>
    <!-- macros.xml is not shown here; it presumably defines the @WRAPPER_VERSION@ token
         and the in1 / adapter_sequence1 / poly_g_min_len macros expanded further down. -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <!-- The fastp package version is tied to the wrapper version token. -->
    <requirements>
        <requirement type="package" version="@WRAPPER_VERSION@">fastp</requirement>
    </requirements>
    <!-- Only the last line of the version output is kept. -->
    <version_command>fastp --version | tail -n 1</version_command>
    <command detect_errors="exit_code"><![CDATA[

## Command template (Cheetah). Lines starting with a double hash are template
## comments and do not appear in the generated shell command.
##
## Steps: (1) symlink the inputs under fixed names, (2) run fastp with the
## selected options, (3) move the fastp outputs onto the Galaxy output paths.
## Parameters that live inside the single_paired conditional are addressed by
## their full path.

## Link input files

#if $single_paired.in1.is_of_type('fastq.gz')
    #set ext = 'fastq.gz'
#else
    #set ext = 'fastq'
#end if

ln -s '$single_paired.in1' input1.$ext &&

#if str($single_paired.single_paired_selector) == 'paired':
    ln -s '$single_paired.in2' input2.$ext &&
#end if


## Run fastp

fastp

## Illumina (1.3+) and Solexa encoded datatypes use phred64 qualities
#if $single_paired.in1.is_of_type('fastqillumina', 'fastqsolexa', 'fastqillumina.gz', 'fastqsolexa.gz'):
    --phred64
#end if

-i input1.$ext
-o first.$ext

#if str($single_paired.single_paired_selector) == 'paired':
    -I input2.$ext
    -O second.$ext
#end if

## Adapter trimming options
## Boolean parameters render as their flag when checked and as an empty string otherwise.

$single_paired.adapter_trimming_options.disable_adapter_trimming

#if str($single_paired.adapter_trimming_options.adapter_sequence1):
    --adapter_sequence '$single_paired.adapter_trimming_options.adapter_sequence1'
#end if

#if str($single_paired.single_paired_selector) == 'paired':
    #if str($single_paired.adapter_trimming_options.adapter_sequence2):
        --adapter_sequence_r2 '$single_paired.adapter_trimming_options.adapter_sequence2'
    #end if
#end if


## Global trimming options
## Optional integers render as an empty string when unset, so each is guarded.

#if str($single_paired.global_trimming_options.trim_front1):
    -f $single_paired.global_trimming_options.trim_front1
#end if

#if str($single_paired.global_trimming_options.trim_tail1):
    -t $single_paired.global_trimming_options.trim_tail1
#end if

#if str($single_paired.single_paired_selector) == 'paired':
    #if str($single_paired.global_trimming_options.trim_front2):
        -F $single_paired.global_trimming_options.trim_front2
    #end if
    #if str($single_paired.global_trimming_options.trim_tail2):
        -T $single_paired.global_trimming_options.trim_tail2
    #end if
#end if


## PolyG tail trimming, useful for NextSeq/NovaSeq data
## poly_g_min_len only exists for the automatic ('') and forced (-g) choices,
## so only that option is guarded.

#if str($polyg_tail_trimming.trimming_select) in ['', '-g']:
    #if str($polyg_tail_trimming.poly_g_min_len):
        --poly_g_min_len $polyg_tail_trimming.poly_g_min_len
    #end if
#end if

## The selected flag is emitted for every choice; previously it sat inside the
## guard above, so the -G (disable) choice never reached fastp.
$polyg_tail_trimming.trimming_select

## Per read cutting by quality options
## Window size and mean quality are only passed when 5' or 3' cutting is enabled.

#if $cutting_by_quality_options.cut_by_quality5 or $cutting_by_quality_options.cut_by_quality3:

    $cutting_by_quality_options.cut_by_quality5

    $cutting_by_quality_options.cut_by_quality3

    #if str($cutting_by_quality_options.cut_window_size):
        -W $cutting_by_quality_options.cut_window_size
    #end if
    #if str($cutting_by_quality_options.cut_mean_quality):
        -M $cutting_by_quality_options.cut_mean_quality
    #end if
#end if


## Quality filtering options

$quality_filtering_options.disable_quality_filtering

#if str($quality_filtering_options.qualified_quality_phred):
    -q $quality_filtering_options.qualified_quality_phred
#end if
#if str($quality_filtering_options.unqualified_percent_limit):
    -u $quality_filtering_options.unqualified_percent_limit
#end if
#if str($quality_filtering_options.n_base_limit):
    -n $quality_filtering_options.n_base_limit
#end if


## Length filtering options

$length_filtering_options.disable_length_filtering

#if str($length_filtering_options.length_required):
    -l $length_filtering_options.length_required
#end if


## Base correction by overlap analysis options

$base_correction_options.correction


## UMI processing
## The UMI sub-options are only passed when UMI processing is enabled.

#if $umi_processing.umi:
    $umi_processing.umi
    #if str($umi_processing.umi_loc):
        --umi_loc '$umi_processing.umi_loc'
    #end if
    #if str($umi_processing.umi_len):
        --umi_len $umi_processing.umi_len
    #end if
    #if str($umi_processing.umi_prefix):
        --umi_prefix '$umi_processing.umi_prefix'
    #end if
#end if

## Overrepresented sequence analysis

$overrepresented_sequence_analysis.overrepresentation_analysis

#if str($overrepresented_sequence_analysis.overrepresentation_sampling):
    -P $overrepresented_sequence_analysis.overrepresentation_sampling
#end if


## Thread options
## The escaped dollar sign leaves the variable for the shell to expand at run
## time, falling back to a single thread when GALAXY_SLOTS is unset.
--thread \${GALAXY_SLOTS:-1}

&&

## Move the fastp outputs onto the Galaxy output dataset paths
mv first.$ext '${out1}'
#if str($single_paired.single_paired_selector) == 'paired':
    &&
    mv second.$ext '${out2}'
#end if
]]></command>
    <!-- Tool form. Parameter names and section names are referenced by the command
         template, the output filters and the tests, so they must stay in sync. -->
    <inputs>
        <!-- Single-end vs. paired mode; each branch carries its own adapter and
             global trimming sections so read 2 options only appear for paired data. -->
        <conditional name="single_paired">
            <param name="single_paired_selector" type="select" label="Single-end or paired reads">
                <option value="single" selected="true">Single-end</option>
                <option value="paired">Paired</option>
            </param>
            <when value="single">
                <expand macro="in1" />
                <section name="adapter_trimming_options" title="Adapter trimming options" expanded="False">
                    <param name="disable_adapter_trimming" argument="-A" type="boolean" truevalue="-A" falsevalue="" checked="false" label="Disable adapter trimming" help="Adapter trimming is enabled by default. If this option is specified, adapter trimming is disabled."/>
                    <expand macro="adapter_sequence1" />
                </section>
                <section name="global_trimming_options" title="Global trimming options" expanded="False">
                    <param name="trim_front1" argument="-f" type="integer" optional="true" label="Trim front for input 1" help="Trimming how many bases in front for read1, default is 0."/>
                    <param name="trim_tail1" argument="-t" type="integer" optional="true" label="Trim tail for input 1" help="Trimming how many bases in tail for read1, default is 0."/>
                </section>
            </when>
            <when value="paired">
                <expand macro="in1" />
                <!-- Required in paired mode: the command always links and passes this
                     dataset, so an unset value would only fail at run time. -->
                <param name="in2" argument="-I" type="data" format="fastq,fastq.gz" label="Input 2" help="Input FASTQ file #2."/>
                <section name="adapter_trimming_options" title="Adapter trimming options" expanded="False">
                    <param name="disable_adapter_trimming" argument="-A" type="boolean" truevalue="-A" falsevalue="" checked="false" label="Disable adapter trimming" help="Adapter trimming is enabled by default. If this option is specified, adapter trimming is disabled."/>
                    <expand macro="adapter_sequence1" />
                    <param name="adapter_sequence2" argument="--adapter_sequence_r2" type="text" optional="true" label="Adapter sequence for input 2" help="The adapter for read2 (PE data only). This is used if R1/R2 are found not overlapped. If not specified, it will be the same as adapter sequence value for input 1.">
                        <!-- Only the four nucleotide letters are accepted. -->
                        <sanitizer>
                            <valid>
                                <add value="A"/>
                                <add value="T"/>
                                <add value="C"/>
                                <add value="G"/>
                            </valid>
                        </sanitizer>
                    </param>
                </section>
                <section name="global_trimming_options" title="Global trimming options" expanded="False">
                    <param name="trim_front1" argument="-f" type="integer" optional="true" label="Trim front for input 1" help="Trimming how many bases in front for read1, default is 0."/>
                    <param name="trim_tail1" argument="-t" type="integer" optional="true" label="Trim tail for input 1" help="Trimming how many bases in tail for read1, default is 0."/>
                    <param name="trim_front2" argument="-F" type="integer" optional="true" label="Trim front for input 2" help="Trimming how many bases in front for read2. If it's not specified, it will follow read1's settings."/>
                    <param name="trim_tail2" argument="-T" type="integer" optional="true" label="Trim tail for input 2" help="Trimming how many bases in tail for read2. If it's not specified, it will follow read1's settings."/>
                </section>
            </when>
        </conditional>
        <!-- The option values are the literal fastp flags; the empty value means
             no flag is passed. The minimum length is not offered when trimming is disabled. -->
        <conditional name="polyg_tail_trimming">
            <param name="trimming_select" type="select" label="PolyG tail trimming, useful for NextSeq/NovaSeq data">
                <option value="" selected="true">Automatic trimming for Illumina NextSeq/NovaSeq data</option>
                <option value="-g">Force polyG tail trimming</option>
                <option value="-G">Disable polyG tail trimming</option>
            </param>
            <when value="-g">
                <expand macro="poly_g_min_len" />
            </when>
            <when value="">
                <expand macro="poly_g_min_len" />
            </when>
            <when value="-G" />
        </conditional>
        <section name="cutting_by_quality_options" title="Per read cutting by quality options" expanded="False">
            <param name="cut_by_quality5" argument="-5" type="boolean" truevalue="-5" falsevalue="" checked="false" label="Cut by quality in front (5')" help="Enable per read cutting by quality in front (5'), default is disabled (WARNING: this will interfere deduplication for both PE/SE data)."/>
            <param name="cut_by_quality3" argument="-3" type="boolean" truevalue="-3" falsevalue="" checked="false" label="Cut by quality in tail (3')" help="Enable per read cutting by quality in tail (3'), default is disabled (WARNING: this will interfere deduplication for SE data)."/>
            <param name="cut_window_size" argument="-W" type="integer" optional="true" label="Cutting window size" help="The size of the sliding window for sliding window trimming, default is 4."/>
            <param name="cut_mean_quality" argument="-M" type="integer" optional="true" label="Cutting mean quality" help="The bases in the sliding window with mean quality below cutting_quality will be cut, default is Q20."/>
        </section>
        <section name="quality_filtering_options" title="Quality filtering options" expanded="False">
            <param name="disable_quality_filtering" argument="-Q" type="boolean" truevalue="-Q" falsevalue="" checked="false" label="Disable quality filtering" help="Quality filtering is enabled by default. If this option is specified, quality filtering is disabled."/>
            <param name="qualified_quality_phred" argument="-q" type="integer" optional="true" label="Qualified quality phred" help="The quality value that a base is qualified. Default 15 means phred quality >=Q15 is qualified."/>
            <!-- Range taken from the help text (0~100). -->
            <param name="unqualified_percent_limit" argument="-u" type="integer" optional="true" min="0" max="100" label="Unqualified percent limit" help="How many percents of bases are allowed to be unqualified (0~100). Default 40 means 40%."/>
            <param name="n_base_limit" argument="-n" type="integer" optional="true" label="N base limit" help="If one read's number of N base is >n_base_limit, then this read/pair is discarded. Default is 5."/>
        </section>
        <section name="length_filtering_options" title="Length filtering options" expanded="False">
            <param name="disable_length_filtering" argument="-L" type="boolean" truevalue="-L" falsevalue="" checked="false" label="Disable length filtering" help="Length filtering is enabled by default. If this option is specified, length filtering is disabled."/>
            <param name="length_required" argument="-l" type="integer" optional="true" label="Length required" help="Reads shorter than this value will be discarded, default is 15."/>
        </section>
        <section name="base_correction_options" title="Base correction by overlap analysis options" expanded="False">
            <param name="correction" argument="-c" type="boolean" truevalue="-c" falsevalue="" checked="false" label="Enable base correction" help="Enable base correction in overlapped regions (only for PE data), default is disabled."/>
        </section>
        <!-- The location, length and prefix are only passed to fastp when the UMI switch is on. -->
        <section name="umi_processing" title="UMI processing" expanded="False">
            <param name="umi" argument="-U" type="boolean" truevalue="-U" falsevalue="" checked="false" label="Enable unique molecular identifier" help="Enable unique molecular identifier (UMI) preprocessing."/>
            <param name="umi_loc" argument="--umi_loc" type="text" optional="true" label="UMI location" help="Specify the location of UMI, can be (index1/index2/read1/read2/per_index/per_read), default is none."/>
            <param name="umi_len" argument="--umi_len" type="integer" optional="true" label="UMI length" help="If the UMI is in read1/read2, its length should be provided."/>
            <param name="umi_prefix" argument="--umi_prefix" type="text" optional="true" label="UMI prefix" help="If specified, an underline will be used to connect prefix and UMI (i.e. prefix=UMI, UMI=AATTCG, final=UMI_AATTCG). No prefix by default."/>
        </section>
        <section name="overrepresented_sequence_analysis" title="Overrepresented sequence analysis" expanded="False">
            <param name="overrepresentation_analysis" argument="-p" type="boolean" truevalue="-p" falsevalue="" checked="false" label="Enable overrepresented analysis" help="Enable overrepresented sequence analysis."/>
            <!-- Range taken from the help text (1~10000). -->
            <param name="overrepresentation_sampling" argument="-P" type="integer" optional="true" min="1" max="10000" label="Overrepresentation sampling" help="One in (--overrepresentation_sampling) reads will be computed for overrepresentation analysis (1~10000), smaller is slower, default is 20."/>
        </section>
        <!-- These switches are not passed to fastp; they only drive the output filters. -->
        <section name="output_options" title="Output options" expanded="False">
            <param name="report_html" type="boolean" truevalue="True" falsevalue="False" checked="True" label="Output HTML report" help="fastp provides a QC report for the data Before and After filtering within a single HTML page, which enables comparison of the quality statistics changed by the preprocessing step directly"/>
            <param name="report_json" type="boolean" truevalue="True" falsevalue="False" checked="False" label="Output JSON report" help="The JSON report contains all the data visualized in the HTML report. The format of the JSON report is manually optimized to be easily readable by humans"/>
        </section>
    </inputs>

    <!-- Output datasets. The read outputs are written by the mv steps at the end of the
         command; the reports are collected from the job working directory. -->
    <outputs>
        <!-- Processed read 1; the datatype follows input 1. -->
        <data name="out1" format_source="in1" label="${tool.name} on ${on_string}: Read 1 Output"/>
        <!-- Processed read 2; the datatype follows input 2, created only in paired mode. -->
        <data name="out2" format_source="in2" label="${tool.name} on ${on_string}: Read 2 Output">
            <filter>single_paired['single_paired_selector'] == "paired"</filter>
        </data>
        <!-- NOTE(review): the command does not name the report files, so fastp.html and
             fastp.json are presumably fastp's default report names; confirm against fastp usage. -->
        <data name="report_html" format="html" from_work_dir="fastp.html" label="${tool.name} on ${on_string}: HTML Report">
            <filter>output_options['report_html'] is True</filter>
        </data>
        <data name="report_json" format="json" from_work_dir="fastp.json" label="${tool.name} on ${on_string}: JSON Report">
            <filter>output_options['report_json'] is True</filter>
        </data>
    </outputs>

    <!-- Functional tests. Input fixtures and expected outputs live in the tool's
         test data directory (not shown here). -->
    <tests>
        <!-- Single-end, default options, fastqsanger input. -->
        <test>
            <param name="in1" value="R1.fq" ftype="fastqsanger"/>
            <param name="single_paired_selector" value="single"/>
            <output name="out1" file="out1.fq" ftype="fastqsanger"/>
        </test>
        <!-- Single-end with an explicit adapter sequence for read 1. -->
        <test>
            <param name="in1" value="R1.fq" ftype="fastq"/>
            <param name="single_paired_selector" value="single"/>
            <param name="adapter_sequence1" value="ATCG"/>
            <output name="out1" file="out_a.fq" ftype="fastq"/>
        </test>
        <!-- Single-end, UMI taken from read1 with length 8. -->
        <test>
            <param name="in1" value="R1.fq" ftype="fastq"/>
            <param name="single_paired_selector" value="single"/>
            <section name="umi_processing">
                <param name="umi" value="true"/>
                <param name="umi_loc" value="read1"/>
                <param name="umi_len" value="8"/>
            </section>
            <output name="out1" file="out2.fq" ftype="fastq"/>
        </test>
        <!-- Single-end, UMI taken from read1 with length 12. -->
        <test>
            <param name="in1" value="R1.fq" ftype="fastq"/>
            <param name="single_paired_selector" value="single"/>
            <section name="umi_processing">
                <param name="umi" value="true"/>
                <param name="umi_loc" value="read1"/>
                <param name="umi_len" value="12"/>
            </section>
            <output name="out1" file="out3.fq" ftype="fastq"/>
        </test>
        <!-- Paired, default options; both read outputs are compared. -->
        <test>
            <param name="in1" value="bwa-mem-fastq1.fq" ftype="fastq"/>
            <param name="in2" value="bwa-mem-fastq2.fq" ftype="fastq"/>
            <param name="single_paired_selector" value="paired"/>
            <output name="out1" file="out_bwa1.fq" ftype="fastq"/>
            <output name="out2" file="out_bwa2.fq" ftype="fastq"/>
        </test>
        <!-- Paired, UMI taken from read1 with length 8. -->
        <test>
            <param name="in1" value="bwa-mem-fastq1.fq" ftype="fastq"/>
            <param name="in2" value="bwa-mem-fastq2.fq" ftype="fastq"/>
            <param name="single_paired_selector" value="paired"/>
            <section name="umi_processing">
                <param name="umi" value="true"/>
                <param name="umi_loc" value="read1"/>
                <param name="umi_len" value="8"/>
            </section>
            <output name="out1" file="out_bwa_umi_read1_1.fq" ftype="fastq"/>
            <output name="out2" file="out_bwa_umi_read1_2.fq" ftype="fastq"/>
        </test>
        <!-- Paired, UMI taken from read2 with length 8. -->
        <test>
            <param name="in1" value="bwa-mem-fastq1.fq" ftype="fastq"/>
            <param name="in2" value="bwa-mem-fastq2.fq" ftype="fastq"/>
            <param name="single_paired_selector" value="paired"/>
            <section name="umi_processing">
                <param name="umi" value="true"/>
                <param name="umi_loc" value="read2"/>
                <param name="umi_len" value="8"/>
            </section>
            <output name="out1" file="out_bwa_umi_read2_1.fq" ftype="fastq"/>
            <output name="out2" file="out_bwa_umi_read2_2.fq" ftype="fastq"/>
        </test>
        <!-- Single-end, default options; also checks that the HTML report contains its title text. -->
        <test>
            <param name="in1" value="R1.fq" ftype="fastq"/>
            <param name="single_paired_selector" value="single"/>
            <output name="out1" file="out1.fq" ftype="fastq"/>
            <output name="report_html">
                <assert_contents>
                    <has_text text="fastp report"/>
                </assert_contents>
            </output>
        </test>
        <!-- Gzipped single-end input with forced polyG trimming and a minimum length of 10;
             the read output is compared with sim_size rather than exact content. -->
        <test>
            <param name="in1" value="R1.fq.gz" ftype="fastq.gz"/>
            <param name="single_paired_selector" value="single"/>
            <param name="trimming_select" value="-g"/>
            <param name="poly_g_min_len" value="10"/>
            <output name="out1" file="out1.fq.gz" ftype="fastq.gz" compare="sim_size"/>
            <output name="report_html">
                <assert_contents>
                    <has_text text="fastp report"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
.. class:: infomark

**What it does**

fastp_ is a tool designed to provide fast all-in-one preprocessing for FASTQ files. This tool is developed in C++ with multithreading supported to afford high performance.

*Features*

1. Filter out bad reads (too low quality, too short, or too many N...)

2. Cut low-quality bases per read at its 5' and 3' ends by evaluating the mean quality in a sliding window (like Trimmomatic but faster).

3. Trim all reads in front and tail

4. Cut adapters. Adapter sequences can be automatically detected, which means you don't have to input the adapter sequences to trim them.

5. Correct mismatched base pairs in overlapped regions of paired end reads, if one base is with high quality while the other is with ultra low quality

6. Preprocess unique molecular identifier (UMI) enabled data, shift UMI to sequence name.

7. Report JSON format result for further interpreting.

8. Visualize quality control and filtering results on a single HTML page (like FASTQC but faster and more informative).

9. Split the output to multiple files (0001.R1.gz, 0002.R1.gz...) to support parallel processing. Two modes can be used: limiting the total number of split files, or limiting the number of lines in each split file.

10. Support long reads (data from PacBio / Nanopore devices).

-----

**Inputs**

Single-end or Paired-end FASTQ or FASTQ.GZ reads

-----

**Outputs**

    * Processed reads

Optionally, under **Output Options** you can choose to output

    * HTML report (default is Yes)
    * JSON report

.. _fastp: https://github.com/OpenGene/fastp

]]></help>
    <!-- Citation is given as a DOI. NOTE(review): the 10.1101 prefix looks like a preprint
         DOI; check whether a journal version should be cited instead. -->
    <citations>
        <citation type="doi">10.1101/274100</citation>
    </citations>
</tool>