Mercurial > repos > iuc > sequali

<tool id="sequali" name="sequali" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="21.05">
    <description>Fast sequencing data quality metrics for short and long reads</description>
    <!-- Version tokens: the first is the sequali package version pinned in
         the requirement below; the second is appended to the tool version as
         the "+galaxy" suffix. -->
    <macros>
        <token name="@TOOL_VERSION@">1.0.2</token>
        <token name="@VERSION_SUFFIX@">0</token>
    </macros>
    <xrefs>
        <xref type="bio.tools">sequali</xref>
    </xrefs>
    <requirements>
        <requirement type="package" version="@TOOL_VERSION@">sequali</requirement>
    </requirements>
    <version_command>sequali --version</version_command>
    <command detect_errors="exit_code"><![CDATA[
    ## Stage the input datasets as symlinks in the working directory, named
    ## after their element identifiers with every character outside
    ## [A-Za-z0-9_.-] replaced by an underscore. sequali is then run on the
    ## link names instead of the raw dataset paths.
    #import re
    #set $mode = str($input_type.input_type_selector)
    #if $mode == 'paired_collection':
        ## Elements 0 and 1 of the collection are used as read 1 and read 2.
        #set $source_1 = str($input_type.input_reads[0])
        #set $source_2 = str($input_type.input_reads[1])
        #set $input_1 = re.sub('[^\w\-_\.]', '_', str($input_type.input_reads[0].element_identifier))
        #set $input_2 = re.sub('[^\w\-_\.]', '_', str($input_type.input_reads[1].element_identifier))
    #else if $mode == 'paired':
        #set $source_1 = str($input_type.input_reads)
        #set $source_2 = str($input_type.input_reads_rev)
        #set $input_1 = re.sub('[^\w\-_\.]', '_', str($input_type.input_reads.element_identifier))
        #set $input_2 = re.sub('[^\w\-_\.]', '_', str($input_type.input_reads_rev.element_identifier))
    #else
        ## Single file: there is no second input.
        #set $source_1 = str($input_type.input_reads)
        #set $source_2 = ''
        #set $input_1 = re.sub('[^\w\-_\.]', '_', str($input_type.input_reads.element_identifier))
        #set $input_2 = ''
    #end if
    ## Two datasets can share the same (sanitised) identifier; without a
    ## distinct name the second "ln -s" would fail because the link exists.
    #if $input_2 and $input_2 == $input_1:
        #set $input_2 = 'read2_' + $input_2
    #end if
    ln -s '${source_1}' '${input_1}' &&
    #if $input_2:
        ln -s '${source_2}' '${input_2}' &&
    #end if

    mkdir -p '${html_report.files_path}' &&
    sequali
        ## Every option below is optional in the form and is only passed to
        ## sequali when the user supplied a value.
        ## NOTE(review): these parameters live in sections whose names contain
        ## hyphens, which cannot be written as a dotted Cheetah path, so they
        ## are still referenced by bare name as before - confirm this resolves
        ## on the targeted Galaxy release.
        #if $adapter:
            --adapter-file '${adapter}'
        #end if
        #if str($overrepresentation_threshold) != '':
            --overrepresentation-threshold-fraction '${overrepresentation_threshold}'
        #end if
        #if str($overrep_min_threshold) != '':
            --overrepresentation-min-threshold '${overrep_min_threshold}'
        #end if
        #if str($overrep_max_threshold) != '':
            --overrepresentation-max-threshold '${overrep_max_threshold}'
        #end if
        #if str($overrep_max_unique_fragments) != '':
            --overrepresentation-max-unique-fragments '${overrep_max_unique_fragments}'
        #end if
        #if str($overrep_fragment_length) != '':
            --overrepresentation-fragment-length '${overrep_fragment_length}'
        #end if
        #if str($overrep_sample_every) != '':
            --overrepresentation-sample-every '${overrep_sample_every}'
        #end if
        #if str($dup_max_stored_fingerprints) != '':
            --duplication-max-stored-fingerprints '${dup_max_stored_fingerprints}'
        #end if
        #if str($fp_front_length) != '':
            --fingerprint-front-length '${fp_front_length}'
        #end if
        #if str($fp_back_length) != '':
            --fingerprint-back-length '${fp_back_length}'
        #end if
        #if str($fp_front_offset) != '':
            --fingerprint-front-offset '${fp_front_offset}'
        #end if
        #if str($fp_back_offset) != '':
            --fingerprint-back-offset '${fp_back_offset}'
        #end if
        --html '$html_report'
        --json '$json_report'
        --threads \${GALAXY_SLOTS:-2}
        '${input_1}'
        #if $input_2:
            '${input_2}'
        #end if

    ]]></command>
    <inputs>
        <!-- Input mode. The command template branches on input_type_selector
             and reads input_reads (plus input_reads_rev in paired mode). -->
        <conditional name="input_type">
            <param name="input_type_selector" type="select" label="Single file or paired-end reads" help="Select between paired-end reads and single file (fastq, ubam) to process">
                <option value="single">Single file</option>
                <option value="paired">Paired-end</option>
                <option value="paired_collection">Paired Collection</option>
            </param>
            <when value="single">
                <param name="input_reads" type="data" format="fastq,fastq.gz,unsorted.bam" label="Select fastq dataset" help="Specify dataset with single reads (fastq or unaligned bam)."/>
            </when>
            <when value="paired">
                <param name="input_reads" type="data" format="fastq,fastq.gz" label="Forward reads" help="Specify FASTQ(.gz) dataset with forward reads."/>
                <param name="input_reads_rev" type="data" format="fastq,fastq.gz" label="Reverse reads" help="Specify FASTQ(.gz) dataset with reverse reads."/>
            </when>
            <when value="paired_collection">
                <!-- Restricted to paired collections: the command uses exactly
                     the first and second element as read 1 and read 2. -->
                <param name="input_reads" type="data_collection" collection_type="paired" format="fastq,fastq.gz" label="Collection with paired-end reads" help="Specify collection with paired-end reads."/>
            </when>
        </conditional>
        <param argument="--adapter-file" name="adapter" format="tabular" type="data" optional="true" label="Adapters to search for"
            help="TSV file with header: Name, Sequencing Technology, Probe sequence, sequence position. Default: https://github.com/rhpvorderman/sequali/tree/develop/src/sequali/adapters/adapter_list.tsv"/>
        <!-- All parameters in the two sections below are optional; the command
             template only emits an option when its value is non-empty. -->
        <section name="overrepresentation-param" title="Overrepresentation parameters" expanded="false">
            <param argument="--overrepresentation-threshold-fraction" name="overrepresentation_threshold" type="float" optional="true" min="0" max="1"
                    label="Fraction to be determined as overrepresented" help="The threshold is calculated as fraction times the number of sampled sequences. Default: 0.001 (1 in 1,000)."/>
            <param argument="--overrepresentation-min-threshold" name="overrep_min_threshold" type="integer" optional="true" min="0"
                    label="Minimum occurrences to be considered overrepresented"
                    help="The minimum amount of occurrences for a sequence to be considered overrepresented, regardless of the bound set by the threshold fraction. Useful for smaller files. Default: 100."/>
            <param argument="--overrepresentation-max-threshold" name="overrep_max_threshold" type="integer" optional="true" min="0"
                    label="Amount of occurrences to be considered overrepresented"
                    help="The amount of occurrences for a sequence to be considered overrepresented, regardless of the bound set by the threshold fraction. Useful for very large files. Default: unlimited."/>
            <param argument="--overrepresentation-max-unique-fragments" name="overrep_max_unique_fragments" type="integer" optional="true" min="0"
                    label="Maximum amount of unique fragments to store"
                    help="Larger amounts increase the sensitivity of finding overrepresented sequences at the cost of increasing memory usage. Default: 5,000,000."/>
            <param argument="--overrepresentation-fragment-length" name="overrep_fragment_length" type="integer" optional="true" min="3" max="31"
                    label="Length of fragments to sample"
                    help="The maximum is 31. Default: 21."/>
            <!-- min="1": sampling one in every N sequences needs a positive N. -->
            <param argument="--overrepresentation-sample-every" name="overrep_sample_every" type="integer" optional="true" min="1"
                    label="Sample one every N sequences"
                    help="More sequences sampled leads to better precision, lower speed, and also more bias towards the beginning of the file as the fragment store gets filled up with more sequences from the beginning. Default is N=8, so one in 8 sequences is analysed."/>
        </section>
        <section name="duplication-param" title="Duplication parameters" expanded="false">
            <param argument="--duplication-max-stored-fingerprints" name="dup_max_stored_fingerprints" type="integer" optional="true" min="0"
                    label="Maximum number of fingerprints stored"
                    help="Determines how many fingerprints are maximally stored to estimate the duplication rate. More fingerprints leads to a more accurate estimate, but also more memory usage. Default: 1,000,000."/>
            <param argument="--fingerprint-front-length" name="fp_front_length" type="integer" optional="true" min="0"
                    label="Number of bases from the front of the sequence"
                    help="Number of bases to be taken for the deduplication fingerprint from the front of the sequence. Default: 8."/>
            <param argument="--fingerprint-back-length" name="fp_back_length" type="integer" optional="true" min="0"
                    label="Number of bases from the back of the sequence"
                    help="Number of bases to be taken for the deduplication fingerprint from the back of the sequence. Default: 8."/>
            <param argument="--fingerprint-front-offset" name="fp_front_offset" type="integer" optional="true" min="0"
                    label="Front offset of the deduplication fingerprint"
                    help="Useful for avoiding adapter sequences. Default: 64 for single end, 0 for paired sequences."/>
            <param argument="--fingerprint-back-offset" name="fp_back_offset" type="integer" optional="true" min="0"
                    label="Back offset of the deduplication fingerprint"
                    help="Useful for avoiding adapter sequences. Default: 64 for single end, 0 for paired sequences."/>
        </section>
    </inputs>
    <outputs>
        <!-- The two report files the command template passes to sequali as
             its HTML and JSON output paths. -->
        <data name="html_report" format="html" label="${tool.name} on ${on_string}: HTML report"/>
        <data name="json_report" format="json" label="${tool.name} on ${on_string}: JSON report"/>
    </outputs>
    <tests>
        <!-- 1: single short-read FASTQ. -->
        <test expect_num_outputs="2">
            <conditional name="input_type">
                <param name="input_type_selector" value="single" />
                <param name="input_reads" value="input_fwd.fastq" />
            </conditional>
            <output name="html_report" ftype="html">
                <assert_contents>
                    <has_text text="Sequali report"/>
                </assert_contents>
            </output>
            <output name="json_report" ftype="json">
                <assert_contents>
                    <has_text text="sequali_version"/>
                </assert_contents>
            </output>
        </test>
        <!-- 2: paired-end reads supplied as two separate datasets. -->
        <test expect_num_outputs="2">
            <conditional name="input_type">
                <param name="input_type_selector" value="paired" />
                <param name="input_reads" value="input_fwd.fastq" />
                <param name="input_reads_rev" value="input_rev.fastq" />
            </conditional>
            <output name="html_report" ftype="html">
                <assert_contents>
                    <has_text text="Sequali report"/>
                    <has_text text="Filename read 2"/>
                </assert_contents>
            </output>
            <output name="json_report" ftype="json">
                <assert_contents>
                    <has_text text="sequali_version"/>
                    <has_text text="filename_read2"/>
                </assert_contents>
            </output>
        </test>
        <!-- 3: single long-read (nanopore) FASTQ. -->
        <test expect_num_outputs="2">
            <conditional name="input_type">
                <param name="input_type_selector" value="single" />
                <param name="input_reads" value="input_nanopore.fastq" />
            </conditional>
            <output name="html_report" ftype="html">
                <assert_contents>
                    <has_text text="Sequali report"/>
                </assert_contents>
            </output>
            <output name="json_report" ftype="json">
                <assert_contents>
                    <has_text text="sequali_version"/>
                </assert_contents>
            </output>
        </test>
        <!-- 4: input that is expected to make the job fail. -->
        <test expect_failure="true">
            <conditional name="input_type">
                <param name="input_type_selector" value="single" />
                <param name="input_reads" value="input_fail.fastq" />
            </conditional>
        </test>
        <!-- 5: paired-end reads supplied as a paired collection; covers the
             third input mode using the same data as test 2. -->
        <test expect_num_outputs="2">
            <conditional name="input_type">
                <param name="input_type_selector" value="paired_collection" />
                <param name="input_reads">
                    <collection type="paired">
                        <element name="forward" value="input_fwd.fastq" />
                        <element name="reverse" value="input_rev.fastq" />
                    </collection>
                </param>
            </conditional>
            <output name="html_report" ftype="html">
                <assert_contents>
                    <has_text text="Sequali report"/>
                    <has_text text="Filename read 2"/>
                </assert_contents>
            </output>
            <output name="json_report" ftype="json">
                <assert_contents>
                    <has_text text="sequali_version"/>
                    <has_text text="filename_read2"/>
                </assert_contents>
            </output>
        </test>
        <!-- 6: single FASTQ with a section parameter set, so that the optional
             command-line branches are exercised at least once. -->
        <test expect_num_outputs="2">
            <conditional name="input_type">
                <param name="input_type_selector" value="single" />
                <param name="input_reads" value="input_fwd.fastq" />
            </conditional>
            <section name="overrepresentation-param">
                <param name="overrep_sample_every" value="4" />
            </section>
            <output name="html_report" ftype="html">
                <assert_contents>
                    <has_text text="Sequali report"/>
                </assert_contents>
            </output>
            <output name="json_report" ftype="json">
                <assert_contents>
                    <has_text text="sequali_version"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
.. class:: infomark

**Purpose**

Sequali_ is a tool for fast sequencing data quality metrics for short and long reads.

Features:

- Informative graphs that allow for judging the quality of a sequence at a quick glance.
- Overrepresentation analysis using 21 bp sequence fragments. Overrepresented sequences are checked against the NCBI univec database.
- Estimate duplication rate using a fingerprint subsampling technique which is also used in filesystem duplication estimation.
- Checks for 6 Illumina adapter sequences and 17 Nanopore adapter sequences for single read data.
- Determines adapters by overlap analysis for paired read data.
- Insert size metrics for paired read data.
- Per tile quality plots for Illumina reads.
- Channel and other plots for Nanopore reads.
- Reproducible reports without timestamps.

-----

**Supported formats**

- FASTQ. Only the Sanger variation with a phred offset of 33 and the error rate calculation of 10 ^ (-phred/10) is supported. All sequencers use this format today.

  - Paired end sequencing data is supported.
  - For sequences called by Illumina base callers an additional plot with the per tile quality will be provided.
  - For sequences called by guppy additional plots for Nanopore specific data will be provided.

- (unaligned) BAM with single reads. Read-pair information is currently ignored.

  - For BAM data as delivered by dorado additional Nanopore plots will be provided.
  - For aligned BAM files, secondary and supplementary reads are ignored similar to how samtools fastq handles the data.

-----

**Outputs**

Sequali produces an informative HTML report with dynamic plots for each quality metric, together with a machine-readable JSON report.

.. _Sequali: https://sequali.readthedocs.io/en/latest/
    ]]></help>
    <!-- Publication to cite when this tool is used, identified by DOI. -->
    <citations>
        <citation type="doi">10.1093/bioadv/vbaf010</citation>
    </citations>
</tool>
author	iuc
date	Mon, 15 Sep 2025 08:22:10 +0000
parents	abea23542a6b
children