<!--
view mgnify_seqprep.xml @ 0:76ea9d4604bc draft default tip

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/mgnify_seqprep commit fd696b8f2ce44287b6ad19fe52277cfdbd7e94fb
author bgruening
date Tue, 14 May 2024 09:49:32 +0000
parents
children
line wrap: on
line source
-->

<tool id="mgnify_seqprep" name="Merging paired-end Illumina reads (SeqPrep, modified for use with MGnify pipelines)" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="22.05">
    <!-- One-line summary of the tool. -->
    <description>Merge and Trim Adapter Sequences from Paired-End Illumina Reads</description>

    <!--
        macros.xml is imported here; the "biotools", "requirements" and
        "creators" macros expanded below are presumably defined there
        (TODO confirm against macros.xml).
    -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="biotools" />
    <expand macro="requirements" />
    <expand macro="creators" />
    <!--
        Cheetah template that assembles the SeqPrep command line.

        Conventions used below:
          * Optional numeric parameters are tested with str(...) != '' rather
            than plain truthiness, so that an explicit 0 / 0.0 entered by the
            user is still passed to SeqPrep instead of being dropped.
          * Boolean parameters (phred64, print_overhang) declare the flag
            itself as their truevalue and an empty string as their falsevalue,
            so they are emitted bare and never followed by a value.
          * Optional text parameters are passed only when non-empty.
    -->
    <command detect_errors="exit_code"><![CDATA[
    SeqPrep
        -f '${input1}'
        -r '${input2}'
        -1 '${output1}'
        -2 '${output2}'

        ## The merged-reads file is only requested when merging is enabled.
        #if $merge_reads
            -s '${merged}'
        #end if

        ## General Arguments ##
        #if $general_options.first_read_discarded
            -3 '${general_options.first_read_discarded}'
        #end if
        #if $general_options.second_read_discarded
            -4 '${general_options.second_read_discarded}'
        #end if
        ## Bare switch: expands to the flag when checked, to nothing otherwise.
        $general_options.phred64
        #if str($general_options.quality_cutoff) != ''
            -q '${general_options.quality_cutoff}'
        #end if
        #if str($general_options.min_length) != ''
            -L '${general_options.min_length}'
        #end if

        ## Additional Adapter/Primer Trimming Arguments ##
        #if $trimming_options.adapter_a
            -A '${trimming_options.adapter_a}'
        #end if
        #if $trimming_options.adapter_b
            -B '${trimming_options.adapter_b}'
        #end if
        #if str($trimming_options.adapter_overlap) != ''
            -O '${trimming_options.adapter_overlap}'
        #end if
        #if str($trimming_options.max_mismatch_fraction) != ''
            -M '${trimming_options.max_mismatch_fraction}'
        #end if
        #if str($trimming_options.min_match_fraction) != ''
            -N '${trimming_options.min_match_fraction}'
        #end if
        #if str($trimming_options.adapter_bandwidth) != ''
            -b '${trimming_options.adapter_bandwidth}'
        #end if
        #if str($trimming_options.gap_open) != ''
            -Q '${trimming_options.gap_open}'
        #end if
        #if str($trimming_options.gap_extend) != ''
            -t '${trimming_options.gap_extend}'
        #end if
        #if str($trimming_options.gap_end) != ''
            -e '${trimming_options.gap_end}'
        #end if
        #if str($trimming_options.local_alignment_score) != ''
            -Z '${trimming_options.local_alignment_score}'
        #end if
        #if str($trimming_options.read_alignment_bandwidth) != ''
            -w '${trimming_options.read_alignment_bandwidth}'
        #end if
        #if str($trimming_options.read_alignment_gap_open) != ''
            -W '${trimming_options.read_alignment_gap_open}'
        #end if
        #if str($trimming_options.read_alignment_gap_extend) != ''
            -p '${trimming_options.read_alignment_gap_extend}'
        #end if
        #if str($trimming_options.read_alignment_gap_end) != ''
            -P '${trimming_options.read_alignment_gap_end}'
        #end if
        #if str($trimming_options.read_alignment_max_gap_fraction) != ''
            -X '${trimming_options.read_alignment_max_gap_fraction}'
        #end if

        ## Additional Arguments for Merging ##
        #if $merging_options.maximum_quality_score
            -y '${merging_options.maximum_quality_score}'
        #end if
        ## Bare switch: expands to the flag when checked, to nothing otherwise.
        $merging_options.print_overhang
        #if str($merging_options.min_base_pair_overlap) != ''
            -o '${merging_options.min_base_pair_overlap}'
        #end if
        #if str($merging_options.max_mismatch_fraction) != ''
            -m '${merging_options.max_mismatch_fraction}'
        #end if
        #if str($merging_options.min_match_fraction) != ''
            -n '${merging_options.min_match_fraction}'
        #end if
    ]]></command>
    <!--
        Tool inputs: the two paired-end FASTQ files, a switch that enables the
        merged-reads output, and three collapsed sections of optional SeqPrep
        arguments. Each optional parameter carries the SeqPrep flag it maps to
        in its "argument" attribute.
    -->
    <inputs>
        <!-- Both plain and gzipped FASTQ are accepted; the test suite exercises both. -->
        <param name="input1" type="data" format="fastq,fastq.gz" label="First Read Input" help="Select the FASTQ file containing the first set of paired-end reads." />
        <param name="input2" type="data" format="fastq,fastq.gz" label="Second Read Input" help="Select the FASTQ file containing the second set of paired-end reads." />
        <param name="merge_reads" type="boolean" truevalue="true" falsevalue="false" checked="true" label="Merge Reads" help="Enable this to merge overlapping reads from the provided paired-end FASTQ files." />

        <!-- Section for General Arguments -->
        <section name="general_options" title="General Arguments (Optional)" expanded="false">
            <!-- NOTE(review): -3/-4 take a user-typed filename that is not declared as a tool output; confirm these files are meant to stay in the job directory. -->
            <param name="first_read_discarded" argument="-3" type="text" optional="true" label="First Read Discarded FASTQ Filename" help="first read discarded fastq filename" />
            <param name="second_read_discarded" argument="-4" type="text" optional="true" label="Second Read Discarded FASTQ Filename" help="second read discarded fastq filename" />
            <!-- Boolean whose truevalue is the bare flag itself. -->
            <param name="phred64" argument="-6" type="boolean" truevalue="-6" falsevalue="" checked="false" label="Input Sequence is in Phred+64 Format" help="Input sequence is in phred+64 rather than phred+33 format, the output will still be phred+33"/>
            <param name="quality_cutoff" argument="-q" type="integer" optional="true" value="13" label="Quality Score Cutoff" help="Quality score cutoff for mismatches to be counted in overlap" />
            <param name="min_length" argument="-L" type="integer" optional="true" value="30" label="Minimum Length of Reads" help="Minimum length of a trimmed or merged read to print it" />
        </section>

        <!-- Section for Additional Adapter/Primer Trimming Arguments -->
        <section name="trimming_options" title="Additional Adapter/Primer Trimming Arguments" expanded="false">
            <param name="adapter_a" argument="-A" label="Adapter Sequence A" type="text" optional="true" value="AGATCGGAAGAGCGGTTCAG" help="Forward read primer/adapter sequence to trim as it would appear at the end of a read" />
            <param name="adapter_b" argument="-B" label="Adapter Sequence B" type="text" optional="true" value="AGATCGGAAGAGCGTCGTGT" help="Reverse read primer/adapter sequence to trim as it would appear at the end of a read" />
            <param name="adapter_overlap" argument="-O" label="Minimum Overall Base Pair Overlap with Adapter" type="integer" value="10" optional="true" help="minimum overall base pair overlap with adapter sequence to trim" />
            <param name="max_mismatch_fraction" argument="-M" label="Maximum Fraction of Good Quality Mismatching Bases" type="float" value="0.02" optional="true" help="maximum fraction of good quality mismatching bases for primer/adapter overlap" />
            <param name="min_match_fraction" argument="-N" label="Minimum Fraction of Matching Bases" type="float" value="0.87" optional="true" help="minimum fraction of matching bases for primer/adapter overlap" />
            <param name="adapter_bandwidth" argument="-b" label="Adapter Alignment Band-width" type="integer" value="50" optional="true" />
            <param name="gap_open" argument="-Q" label="Adapter Alignment Gap-Open" type="integer" value="8" optional="true" />
            <param name="gap_extend" argument="-t" label="Adapter Alignment Gap-Extension" type="integer" value="2" optional="true" />
            <param name="gap_end" argument="-e" label="Adapter Alignment Gap-End" type="integer" value="2" optional="true" />
            <param name="local_alignment_score" argument="-Z" label="Minimum Local Alignment Score Cutoff" type="integer" value="26" optional="true" help="Adapter alignment minimum local alignment score cutoff [roughly (2*num_hits) - (num_gaps*gap_open) - (num_gaps*gap_close) - (gap_len*gap_extend) - (2*num_mismatches)]" />
            <param name="read_alignment_bandwidth" argument="-w" label="Read Alignment Band-width" type="integer" value="50" optional="true" />
            <param name="read_alignment_gap_open" argument="-W" label="Read Alignment Gap-Open" type="integer" value="26" optional="true" />
            <param name="read_alignment_gap_extend" argument="-p" label="Read Alignment Gap-Extension" type="integer" value="9" optional="true" />
            <param name="read_alignment_gap_end" argument="-P" label="Read Alignment Gap-End" type="integer" value="5" optional="true" />
            <!-- This help text was previously attached to -P; it describes -X, as the label shows. -->
            <param name="read_alignment_max_gap_fraction" argument="-X" label="Read Alignment Maximum Fraction Gap Cutoff" type="float" value="0.125" optional="true" help="read alignment maximum fraction gap cutoff" />
        </section>

        <!-- Section for Optional Arguments for Merging: -->
        <section name="merging_options" title="Optional Arguments for Merging" expanded="false">
            <!-- NOTE(review): Galaxy's default text sanitizer may rewrite characters such as ']' before they reach the command line; confirm whether a sanitizer element is needed here. -->
            <param name="maximum_quality_score" argument="-y" label="Maximum Quality Score in Output" type="text" optional="true" help="Maximum quality score in output (phred 33), default = ']'"/>
            <!-- Boolean whose truevalue is the bare flag itself. -->
            <param name="print_overhang" argument="-g" type="boolean" truevalue="-g" falsevalue="" checked="false" label="Print Overhang When Adapters Are Present and Stripped" help="Use this if reads are different lengths"/>
            <param name="min_base_pair_overlap" argument="-o" type="integer" optional="true" value="15" label="Minimum Overall Base Pair Overlap" help="Minimum overall base pair overlap to merge two reads"/>
            <param name="max_mismatch_fraction" argument="-m" type="float" optional="true" value="0.02" label="Maximum Fraction of Good Quality Mismatching Bases" help="Maximum fraction of good quality mismatching bases to overlap reads"/>
            <param name="min_match_fraction" argument="-n" type="float" optional="true" value="0.9" label="Minimum Fraction of Matching Bases" help="Minimum fraction of matching bases to overlap reads"/>
        </section>
    </inputs>
    <!--
        Tool outputs, all gzipped FASTQ.

        output1 / output2 are always produced: the command passes them
        unconditionally via -1 / -2 and every test expects them. They
        previously carried a filter on "output_all", a name that does not
        correspond to any input of this tool, so that filter has been dropped.

        merged is only created when the "merge_reads" input is enabled, which
        matches the conditional -s argument in the command.
    -->
    <outputs>
        <data format="fastq.gz" name="output1" label="${tool.name} on ${on_string}: First Read Output" />
        <data format="fastq.gz" name="output2" label="${tool.name} on ${on_string}: Second Read Output" />
        <data format="fastq.gz" name="merged" label="${tool.name} on ${on_string}: Merged Reads">
            <filter>merge_reads</filter>
        </data>
    </outputs>
    <!--
        Functional tests. Parameters that live inside a section of the inputs
        are always addressed through the matching section element so that each
        value is bound to the intended parameter.
    -->
    <tests>
        <!-- Test default inputs #1 -->
        <test expect_num_outputs="3">
            <param name="input1" value="input1.fq" />
            <param name="input2" value="input2.fq" />
            <param name="merge_reads" value="true" />

            <!-- Section for General Arguments -->
            <section name="general_options">
                <param name="quality_cutoff" value="13" />
                <param name="min_length" value="30" />
            </section>

            <!-- Section for Additional Adapter/Primer Trimming Arguments -->
            <section name="trimming_options">
                <param name="adapter_a" value="AGATCGGAAGAGCGGTTCAG" />
                <param name="adapter_b" value="AGATCGGAAGAGCGTCGTGT" />
                <param name="adapter_overlap" value="10" />
                <param name="max_mismatch_fraction" value="0.02" />
                <param name="min_match_fraction" value="0.87" />
                <param name="adapter_bandwidth" value="50" />
                <param name="gap_open" value="8" />
                <param name="gap_extend" value="2" />
                <param name="gap_end" value="2" />
                <param name="local_alignment_score" value="26" />
                <param name="read_alignment_bandwidth" value="50" />
                <param name="read_alignment_gap_open" value="26" />
                <param name="read_alignment_gap_extend" value="9" />
                <param name="read_alignment_gap_end" value="5" />
                <param name="read_alignment_max_gap_fraction" value="0.125" />
            </section>
            <output name="output1" file="output1.fq.gz" />
            <output name="output2" file="output2.fq.gz" />
            <output name="merged" file="merged_output.fq.gz" />
        </test>

        <!-- Without Merging, Two Outputs #2 -->
        <test expect_num_outputs="2">
            <param name="input1" value="input1.fq" />
            <param name="input2" value="input2.fq" />
            <param name="merge_reads" value="false" />

            <!-- Section for General Arguments -->
            <section name="general_options">
                <param name="quality_cutoff" value="13" />
                <param name="min_length" value="30" />
            </section>

            <!-- Section for Additional Adapter/Primer Trimming Arguments -->
            <section name="trimming_options">
                <param name="adapter_a" value="AGATCGGAAGAGCGGTTCAG" />
                <param name="adapter_b" value="AGATCGGAAGAGCGTCGTGT" />
                <param name="adapter_overlap" value="10" />
                <param name="max_mismatch_fraction" value="0.02" />
                <param name="min_match_fraction" value="0.87" />
                <param name="adapter_bandwidth" value="50" />
                <param name="gap_open" value="8" />
                <param name="gap_extend" value="2" />
                <param name="gap_end" value="2" />
                <param name="local_alignment_score" value="26" />
                <param name="read_alignment_bandwidth" value="50" />
                <param name="read_alignment_gap_open" value="26" />
                <param name="read_alignment_gap_extend" value="9" />
                <param name="read_alignment_gap_end" value="5" />
                <param name="read_alignment_max_gap_fraction" value="0.125" />
            </section>
            <output name="output1" file="outputNoMerge1.fq.gz" />
            <output name="output2" file="outputNoMerge2.fq.gz" />
        </test>

        <!-- Test with Empty Input Files #3 -->
        <test expect_num_outputs="3">
            <param name="input1" value="empty1.fq" />
            <param name="input2" value="empty2.fq" />
            <param name="merge_reads" value="true" />

            <!-- Section for General Arguments -->
            <section name="general_options">
                <param name="quality_cutoff" value="13" />
                <param name="min_length" value="30" />
            </section>

            <!-- Section for Additional Adapter/Primer Trimming Arguments -->
            <section name="trimming_options">
                <param name="adapter_a" value="AGATCGGAAGAGCGGTTCAG" />
                <param name="adapter_b" value="AGATCGGAAGAGCGTCGTGT" />
                <param name="adapter_overlap" value="10" />
                <param name="max_mismatch_fraction" value="0.02" />
                <param name="min_match_fraction" value="0.87" />
                <param name="adapter_bandwidth" value="50" />
                <param name="gap_open" value="8" />
                <param name="gap_extend" value="2" />
                <param name="gap_end" value="2" />
                <param name="local_alignment_score" value="26" />
                <param name="read_alignment_bandwidth" value="50" />
                <param name="read_alignment_gap_open" value="26" />
                <param name="read_alignment_gap_extend" value="9" />
                <param name="read_alignment_gap_end" value="5" />
                <param name="read_alignment_max_gap_fraction" value="0.125" />
            </section>
            <output name="output1" file="empty_output1.fq.gz" />
            <output name="output2" file="empty_output2.fq.gz" />
            <output name="merged" file="empty_merged_output.fq.gz" />
        </test>

        <!-- Advanced Functional Tests -->
        <!-- General Arguments Test #4 -->
        <test expect_num_outputs="2">
            <param name="input1" value="input1.fq" />
            <param name="input2" value="input2.fq" />
            <param name="merge_reads" value="false" />

            <!-- These two parameters belong to the general_options section of the inputs. -->
            <section name="general_options">
                <param name="quality_cutoff" value="15" />
                <param name="min_length" value="25" />
            </section>
            <output name="output1" file="output1_general_args.fq.gz" />
            <output name="output2" file="output2_general_args.fq.gz" />
        </test>

        <!-- Adapter/Primer Trimming Arguments Test #5 -->
        <test expect_num_outputs="2">
            <param name="input1" value="input1.fq" />
            <param name="input2" value="input2.fq" />
            <param name="merge_reads" value="false" />
            <section name="trimming_options">
                <param name="adapter_a" value="ACTGACTG" />
                <param name="adapter_b" value="GTGACTGA" />
                <param name="adapter_overlap" value="12" />
                <param name="max_mismatch_fraction" value="0.03" />
                <param name="min_match_fraction" value="0.85" />
                <param name="adapter_bandwidth" value="55" />
                <param name="gap_open" value="10" />
                <param name="gap_extend" value="3" />
                <param name="gap_end" value="3" />
                <param name="local_alignment_score" value="28" />
            </section>
            <output name="output1" file="output1_adapter_trim.fq.gz" />
            <output name="output2" file="output2_adapter_trim.fq.gz" />
        </test>

        <!-- Test with gzipped input files #6 -->
        <test expect_num_outputs="3">
            <param name="input1" value="input1.fastq.gz" />
            <param name="input2" value="input2.fastq.gz" />
            <param name="merge_reads" value="true" />
            <output name="output1" file="output1_from_gzipped.fq.gz" />
            <output name="output2" file="output2_from_gzipped.fq.gz" />
            <output name="merged" file="merged_output_from_gzipped.fq.gz" />
        </test>
    </tests>
    <help><![CDATA[
.. class:: warningmark

**Caution**
-----------
::

    This is a modified version of the 1.2 release, made for use with the MGnify pipeline.

    Difference in `utils.h`:

::
    
    -#define MAX_SEQ_LEN (256)
    +#define MAX_SEQ_LEN (1024)

**SeqPrep**
-----------
::

    SeqPrep is a versatile tool designed for merging overlapping paired-end Illumina reads into a single, longer read.
    Additionally, it offers the functionality to trim adapter sequences from reads, making it a useful tool for preprocessing Illumina sequencing data.

**Usage**
=========
::

    To utilize SeqPrep, start by selecting your input FASTQ files: one for the first set of reads and another for the second set.
    SeqPrep provides several options to customize your data processing:

    - Adapter Sequences: You can provide specific sequences for adapter trimming if they are known. SeqPrep will remove these sequences from the reads.
    - Quality Score Cutoff: Set the quality score cutoff used when counting mismatches in the overlap between reads.
    - Minimum Read Length: Define the minimum length for reads to be retained after trimming. Reads shorter than this length will be discarded.

    If the merging feature is enabled, SeqPrep will combine overlapping reads into longer sequences, thereby enhancing the data quality for downstream analysis.

**Outputs**
===========
::

    SeqPrep generates outputs in gzipped FASTQ format.

See more details on the `SeqPrep GitHub repository <https://github.com/jstjohn/SeqPrep>`_.

    ]]></help>
    <expand macro="citations"/>
</tool>