view rgFastQC.xml @ 22:3d0c7bdf12f5 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fastqc commit b3892d09a9d4d72f3eade0fb2f0b61ab0d9f22e3"
author iuc
date Sun, 12 Sep 2021 11:20:27 +0000
parents e7b2202befea
children 5ec9f6bceaee
line wrap: on
line source

<tool id="fastqc" name="FastQC" version="0.73+galaxy0">
    <description>Read Quality reports</description>
    <!-- Cross-reference tying this wrapper to the "fastqc" entry of the bio.tools registry. -->
    <xrefs>
        <xref type="bio.tools">fastqc</xref>
    </xrefs>
    <!-- Software dependency: the fastqc package, pinned to version 0.11.9. -->
    <requirements>
        <requirement type="package" version="0.11.9">fastqc</requirement>
    </requirements>
    <!-- Job failure detection: every non-zero exit code (positive or negative)
         is fatal, and the JVM "insufficient memory" banner on stdout is
         reported as an out-of-memory failure. -->
    <stdio>
        <exit_code range="1:" level="fatal" description="FastQC returned non zero exit code" />
        <exit_code range=":-1" level="fatal" description="FastQC returned non zero exit code" />
        <regex source="stdout"
               level="fatal_oom"
               match="There is insufficient memory for the Java Runtime Environment"
               description="Out of memory error occurred" />
    </stdio>
    <!--
        Command template (Cheetah directives mixed with shell text).

        1. Link name: the input's element identifier with every character that
           is not a word character, hyphen or whitespace replaced by "_".
           A ".gz" or ".bz2" suffix is appended again when the datatype
           extension ends with it.
        2. Format: the value handed to the fastqc -f option is "bam" or "sam"
           when that string occurs in the datatype extension, otherwise "fastq".
        3. Run: symlink the input under the link name, create the files_path
           directory of the HTML output and run fastqc with that directory as
           its output directory. The contaminants, adapters, limits and
           min_length options are emitted only when their #if guards pass;
           $nogroup expands to the boolean parameter's true/false value; the
           thread count is taken from GALAXY_SLOTS with a fallback of 2.
        4. Collect: copy fastqc_data.txt and the HTML report out of the output
           directory to output.txt and output.html (relative paths).

        NOTE(review): a link name that begins with a hyphen passes the
        sanitising regex unchanged; confirm whether ln or fastqc could
        interpret such a name as an option.
    -->
    <command><![CDATA[
        #import re
        #set input_name = re.sub('[^\w\-\s]', '_', str($input_file.element_identifier))

        #if $input_file.ext.endswith('.gz'):
            #set input_file_sl = $input_name + '.gz'
        #elif $input_file.ext.endswith('.bz2'):
            #set input_file_sl = $input_name + '.bz2'
        #else
            #set input_file_sl = $input_name
        #end if

        #if 'bam' in $input_file.ext:
            #set format = 'bam'
        #elif 'sam' in $input_file.ext:
            #set format = 'sam'
        #else
            #set format = 'fastq'
        #end if

        ln -s '${input_file}' '${input_file_sl}' &&
        mkdir -p '${html_file.files_path}' &&
        fastqc
            --outdir '${html_file.files_path}'
            #if $contaminants.dataset and str($contaminants) > ''
                --contaminants '${contaminants}'
            #end if

            #if $adapters.dataset and str($adapters) > ''
                --adapters '${adapters}'
            #end if

            #if $limits.dataset and str($limits) > ''
                --limits '${limits}'
            #end if
            --threads \${GALAXY_SLOTS:-2}
            --quiet
            --extract
            #if $min_length:
                --min_length $min_length
            #end if
            $nogroup
            --kmers $kmers
            -f '${format}'
            '${input_file_sl}'

        && cp '${html_file.files_path}'/*/fastqc_data.txt output.txt
        && cp '${html_file.files_path}'/*\.html output.html

    ]]></command>
    <!-- User-facing parameters. Every parameter that maps onto a fastqc
         command-line flag is declared through "argument", so the parameter
         name is derived from the flag (leading dashes stripped) and the flag
         is shown next to the help text. The derived names are the ones the
         command template and the tests refer to. -->
    <inputs>
        <param format="fastq,fastq.gz,fastq.bz2,bam,sam" name="input_file" type="data"
               label="Raw read data from your current history" />
        <param argument="--contaminants" type="data" format="tabular" optional="true" label="Contaminant list"
               help="Tab-delimited file with 2 columns: name and sequence. For example: Illumina Small RNA RT Primer CAAGCAGAAGACGGCATACGA" />
        <param argument="--adapters" type="data" format="tabular" optional="true" label="Adapter list"
               help="List of adapter sequences which will be explicitly searched against the library. It should be a tab-delimited file with 2 columns: name and sequence." />
        <param argument="--limits" type="data" format="txt" optional="true" label="Submodule and Limit specifying file"
               help="A file that specifies which submodules are to be executed (default=all) and also specifies the thresholds for each submodule's warning parameter" />
        <param argument="--nogroup" type="boolean" truevalue="--nogroup" falsevalue="" checked="False"
               label="Disable grouping of bases for reads >50bp" help="Using this option will cause fastqc to crash and burn if you use it on really long reads, and your plots may end up a ridiculous size. You have been warned!"/>
        <param argument="--min_length" type="integer" value="" optional="true"
               label="Lower limit on the length of the sequence to be shown in the report"
               help="As long as you set this to a value greater than or equal to your longest read length then this will be the sequence length used to create your read groups. This can be useful for making directly comparable statistics from datasets with somewhat variable read lengths."/>
        <param argument="--kmers" type="integer" value="7" min="2" max="10"
               label="Length of Kmer to look for" help="Note: the Kmer test is disabled and needs to be enabled using a custom Submodule and limits file"/>
    </inputs>
    <!-- Both datasets are picked up from files the command writes under
         relative names: output.html (report page) and output.txt (raw data). -->
    <outputs>
        <data name="html_file" format="html" from_work_dir="output.html"
              label="${tool.name} on ${on_string}: Webpage" />
        <data name="text_file" format="txt" from_work_dir="output.txt"
              label="${tool.name} on ${on_string}: RawData" />
    </outputs>
    <!-- Functional tests. Each case compares the HTML report (allowing two
         differing lines) and the raw data file against stored expectations. -->
    <tests>
        <!-- Defaults only, FASTQ input. -->
        <test>
            <param name="input_file" value="1000trimmed.fastq" />
            <output name="html_file" file="fastqc_report.html" ftype="html" lines_diff="2"/>
            <output name="text_file" file="fastqc_data.txt" ftype="txt"/>
        </test>
        <!-- Custom contaminant list. -->
        <test>
            <param name="input_file" value="1000trimmed.fastq" />
            <param name="contaminants" value="fastqc_contaminants.txt" ftype="tabular" />
            <output name="html_file" file="fastqc_report_contaminants.html" ftype="html" lines_diff="2"/>
            <output name="text_file" file="fastqc_data_contaminants.txt" ftype="txt"/>
        </test>
        <!-- Custom adapter list. -->
        <test>
            <param name="input_file" value="1000trimmed.fastq" />
            <param name="adapters" value="fastqc_adapters.txt" ftype="tabular" />
            <output name="html_file" file="fastqc_report_adapters.html" ftype="html" lines_diff="2"/>
            <output name="text_file" file="fastqc_data_adapters.txt" ftype="txt"/>
        </test>
        <!-- Custom limits file. -->
        <test>
            <param name="input_file" value="1000trimmed.fastq" />
            <param name="limits" value="fastqc_customlimits.txt" ftype="txt" />
            <output name="html_file" file="fastqc_report_customlimits.html" ftype="html" lines_diff="2"/>
            <output name="text_file" file="fastqc_data_customlimits.txt" ftype="txt"/>
        </test>
        <!-- Non-default Kmer length together with the custom limits file;
             also checks that the kmers option reaches the command line. -->
        <test>
            <param name="input_file" value="1000trimmed.fastq" ftype="fastq" />
            <param name="kmers" value="3" />
            <param name="limits" value="fastqc_customlimits.txt" ftype="txt" />
            <output name="html_file" file="fastqc_report_kmer.html" ftype="html" lines_diff="2"/>
            <output name="text_file" file="fastqc_data_kmer.txt" ftype="txt"/>
            <assert_command>
                <has_text text="--kmers 3"/>
            </assert_command>
        </test>
        <!-- Explicit minimum length. -->
        <test>
            <param name="input_file" value="1000trimmed.fastq" />
            <param name="min_length" value="108" />
            <output name="html_file" file="fastqc_report_min_length.html" ftype="html" lines_diff="2"/>
            <output name="text_file" file="fastqc_data_min_length.txt" ftype="txt"/>
        </test>
        <!-- Base grouping disabled. The boolean is switched on with "true"
             rather than its raw truevalue string; the command assertion
             confirms the flag is emitted. -->
        <test>
            <param name="input_file" value="1000trimmed.fastq" ftype="fastq" />
            <param name="nogroup" value="true" />
            <output name="html_file" file="fastqc_report_nogroup.html" ftype="html" lines_diff="2"/>
            <output name="text_file" file="fastqc_data_nogroup.txt" ftype="txt"/>
            <assert_command>
                <has_text text="--nogroup"/>
            </assert_command>
        </test>
        <!-- BAM input. -->
        <test>
            <param name="input_file" value="hisat_output_1.bam" ftype="bam" />
            <output name="html_file" file="fastqc_report_hisat.html" ftype="html" lines_diff="2"/>
            <output name="text_file" file="fastqc_data_hisat.txt" ftype="txt"/>
        </test>
    </tests>
    <help><![CDATA[
.. class:: infomark

**Purpose**

FastQC aims to provide a simple way to do some quality control checks on raw
sequence data coming from high throughput sequencing pipelines.
It provides a set of analyses which you can use to get a quick
impression of whether your data has any problems of
which you should be aware before doing any further analysis.

The main functions of FastQC are:

- Import of data from BAM, SAM or FastQ/FastQ.gz/FastQ.bz2 files (any variant)
- Providing a quick overview to tell you in which areas there may be problems
- Summary graphs and tables to quickly assess your data
- Export of results to an HTML based permanent report
- Offline operation to allow automated generation of reports without running the interactive application

-----

.. class:: infomark

**FastQC**

This is a Galaxy wrapper. It merely exposes the external package FastQC_, which is documented on the FastQC_ website.
Kindly acknowledge it as well as this tool if you use it.
FastQC incorporates the Picard-tools_ libraries for SAM/BAM processing.

The contaminants file parameter was borrowed from the independently developed
fastqcwrapper contributed to the Galaxy Community Tool Shed by J. Johnson.
Adaption to version 0.11.2 by T. McGowan.

-----

.. class:: infomark

**Inputs and outputs**

FastQC_ is the best place to look for documentation - it's very good.
A summary follows below for those in a tearing hurry.

This wrapper will accept a Galaxy fastq, fastq.gz, fastq.bz2, sam or bam dataset as the input read file to check.
It will also take an optional file containing a list of contaminants, in the form of
a tab-delimited file with 2 columns, name and sequence. As another option the tool takes a custom
limits.txt file that allows setting the warning thresholds for the different modules and also specifies
which modules to include in the output.

The tool produces a basic text and an HTML output file that contain all of the results, including the following:

- Basic Statistics
- Per base sequence quality
- Per sequence quality scores
- Per base sequence content
- Per base GC content
- Per sequence GC content
- Per base N content
- Sequence Length Distribution
- Sequence Duplication Levels
- Overrepresented sequences
- Kmer Content

All except Basic Statistics and Overrepresented sequences are plots.

.. _FastQC: http://www.bioinformatics.babraham.ac.uk/projects/fastqc/
.. _Picard-tools: https://broadinstitute.github.io/picard/
    ]]></help>
    <!-- Citation for the wrapped FastQC program, supplied as an inline BibTeX entry. -->
    <citations>
        <citation type="bibtex">
        @unpublished{andrews_s,
            author = {Andrews, S.},
            keywords = {bioinformatics, ngs, qc},
            priority = {2},
            title = {{FastQC A Quality Control tool for High Throughput Sequence Data}},
            url = {http://www.bioinformatics.babraham.ac.uk/projects/fastqc/}
        }
        </citation>
    </citations>
</tool>