view rna_quast.xml @ 6:8e66f695d859 draft default tip

planemo upload for repository https://git.ufz.de/lehmanju/rnaquast commit 7536270b3b51024e516b840728db6e6d0f903f69
author iuc
date Wed, 10 Jan 2024 13:24:53 +0000
parents f89e3c318453
children
line wrap: on
line source

<tool id="rna_quast" name="rnaQUAST" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>A quality assessment tool for De Novo transcriptome assemblies</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro='xrefs'/>
    <expand macro='requirements'/>
    <stdio>
        <!-- Treat any Python traceback printed on stdout or stderr as a fatal error. -->
        <regex match="Traceback " source="both" level="fatal" description="rnaQuast failed" />
    </stdio>
    <command detect_errors="exit_code"><![CDATA[
    ## Stage 1: create work directories and link every input under a shell-safe name
    ## (every character that is not a word character, a dash or a dot becomes an underscore).
    mkdir -p './complete_reports/' &&
    mkdir -p './fasta_files/' &&
    #import re
    #for $i in $transcripts
        ln -s '$i' '${re.sub('[^\w\-.]', '_', i.element_identifier)}' &&
    #end for
    #if $reference
        #for $rf in $reference
            ln -s '$rf' '${re.sub('[^\w\-.]', '_', rf.element_identifier)}' &&
        #end for
    #end if
    #if $gene_coordinates.selector == "true"
        #for $g in $gene_coordinates.gtf
            ln -s '$g' '${re.sub('[^\w\-.]', '_', g.element_identifier)}' &&
        #end for
    #end if
    ## Stage 2: run rnaQUAST on the linked inputs.
    mkdir outputdir &&
    rnaQUAST.py
    --threads \${GALAXY_SLOTS:-8}
    --transcripts
    #for $i in $transcripts
         '${re.sub('[^\w\-.]', '_', i.element_identifier)}'
    #end for
    #if $reads_option.selector == 'paired'
        --left_reads '${reads_option.forward_reads}'
        --right_reads '${reads_option.reverse_reads}'
    #else if $reads_option.selector == 'single'
        --single_reads '${reads_option.single_reads}'
    #end if
    $advanced_options.strand_specific
    #if $reads_alignment
        --reads_alignment '${reads_alignment}'
    #end if
    #if $reference
        -r
        #for $rf in $reference
            '${re.sub('[^\w\-.]', '_', rf.element_identifier)}'
        #end for
    #end if
    #if $gene_coordinates.selector == "true"
        --gtf
        #for $g in $gene_coordinates.gtf
            '${re.sub('[^\w\-.]', '_', g.element_identifier)}'
        #end for
        $gene_coordinates.disable_infer_genes
        $gene_coordinates.disable_infer_transcripts
    #end if
    $advanced_options.prokaryote
    --min_alignment $advanced_options.min_alignment
    $advanced_options.blat

    ## Plots are only needed for the PDF short report.
    #if "pdf" not in $output_options.out_sr
        --no_plots
    #end if
    #if $use_busco.selector == 'true'
        --busco
        #if $use_busco.lineage_conditional.selector == 'cached'
            '${use_busco.lineage_conditional.cached_db.fields.path}'
        #else
            ## The lineage select is nested inside lineage_conditional.
            '${use_busco.lineage_conditional.lineage}'
        #end if
    #end if
    ## $advanced_options.gene_mark
    $advanced_options.meta
    --lower_threshold $advanced_options.lower_threshold
    --upper_threshold $advanced_options.upper_threshold
    -o outputdir

    ## Stage 3: post-processing of the rnaQUAST output folder.
    #if 'gz' in $output_options.out_add
        && tar -czvf results.tar.gz './outputdir'
    #end if

    #if len($transcripts) == 1
        ## rnaQUAST receives the sanitized symlink name, so the per-assembly folder
        ## has to be derived from the sanitized identifier as well.
        #set $safe_id = re.sub('[^\w\-.]', '_', $transcripts[0].element_identifier)
        #set $path = "/".join(['outputdir', $safe_id.split(".")[0]]) + "_output"
        && mv '${path}' './results'
        ## rename .list files to .txt files to make them detectable
        && find './results/' -name "*.list" -exec mv {} {}.txt \;
        && true
        && printf "************ METRICS/TRANSCRIPTS ***************\n" > stats.txt
        ## Each report is appended without its first line. A former sed stage piped into tail
        ## was dropped: tail read the file operand and ignored the pipe, so the output is unchanged.
        && for file_name in ./results/*txt; do printf "\n************ \$file_name ************\n" >> stats.txt
        && tail -q -n +2 "\$file_name" >> stats.txt;
        done
        && cat stats.txt > '$stats'
        #if $gene_coordinates.selector == 'true' and $reference
            && mv ./results/*fasta ./fasta_files/
        #end if
    #else
        && mkdir -p './results/'
        #if $gene_coordinates.selector == 'true' and $reference
            ## Collect the FASTA files of every assembly, one per-assembly folder at a time.
            #for $transcript in $transcripts
                #set $safe_id = re.sub('[^\w\-.]', '_', $transcript.element_identifier)
                #set $path = "/".join(['outputdir', $safe_id.split(".")[0]]) + "_output"
                && rm -r ./results
                && cp -r '${path}' './results'
                && mv ./results/*fasta './fasta_files/'
            #end for
        #end if
        && find './outputdir/comparison_output' -name "*.list" -exec mv {} {}.txt \;
        && true
        && printf "************ COMPARISON METRICS ***************\n" > stats.txt
        && for file_name in ./outputdir/comparison_output/*txt; do printf "\n************ \$file_name ************\n" >> stats.txt
        && tail -q -n +2 "\$file_name" >> stats.txt; done
        && cat stats.txt > '$stats'
    #end if
    ]]>    </command>
    <inputs>
        <!-- Assemblies to evaluate; several files trigger rnaQUAST's comparison mode. -->
        <param argument="--transcripts" type="data" format="fasta" multiple="true" label="Transcripts" help="File(s) with transcripts in FASTA format."/>
        <!-- Optional raw reads. The empty value (default) runs rnaQUAST without reads. -->
        <conditional name="reads_option">
            <param name="selector" type="select" label="Single-end or paired-end reads">
                <option value="" selected="true">Disabled</option>
                <option value="single">Single-end</option>
                <option value="paired">Paired-end (as individual datasets)</option>
            </param>
            <when value=""/>
            <when value="single">
                <param name="single_reads" type="data" format="fastq,fastq.gz,fastqsanger,fastqsanger.gz" label="RNA-Seq FASTQ/FASTA file"/>
            </when>
            <when value="paired">
                <param name="forward_reads" type="data" format="fastq,fastq.gz,fastqsanger,fastqsanger.gz" label="RNA-Seq FASTQ/FASTA file, forward reads"/>
                <param name="reverse_reads" type="data" format="fastq,fastq.gz,fastqsanger,fastqsanger.gz" label="RNA-Seq FASTQ/FASTA file, reverse reads"/>
            </when>
        </conditional>
        <param argument="--reference" type="data" format="fasta" label="Reference genome" multiple="true" optional="true" help="File with reference genome containing all chromosomes/scaffolds in FASTA format." />
        <conditional name="gene_coordinates">
            <param name="selector" type="select" label="Genome annotation" help="Genome annotation file. We recommend to use files downloaded from GENCODE or Ensembl.">
                <option value="true">Enabled</option>
                <option value="false" selected="true">Disabled</option>
            </param>
            <when value="true">
                <param argument="--gtf" type="data" format="gtf,gff,gff3" multiple="true" label="GTF/GFF file" />
                <param argument="--disable_infer_genes" type="boolean" truevalue="--disable_infer_genes" falsevalue="" checked="false" label="Disable infer genes"
                    help="Use this option if your GTF file already contains genes records, otherwise gffutils will fix it. Note that gffutils may work for quite a long time"/>
                <param argument="--disable_infer_transcripts" type="boolean" truevalue="--disable_infer_transcripts" falsevalue="" checked="false" label="Disable infer transcripts"
                    help="Use this option if your GTF file already contains transcripts records, otherwise gffutils will fix it."/>
            </when>
            <when value="false"/>
        </conditional>
        <param argument="--reads_alignment" type="data" format="sam" label="Aligned reads to reference genome" optional="true" help="File with read alignments to the reference genome" />
        <conditional name="use_busco">
            <!-- "selector" is a wrapper-internal switch, not an rnaQUAST flag, hence name= rather than argument=. -->
            <param name="selector" type="select" label="Run BUSCO" help="BUSCO allows to detect core genes in the assembled transcripts">
                <option value="false">Disabled</option>
                <option value="true">Enabled</option>
            </param>
            <when value="false"/>
            <when value="true">
                <conditional name="lineage_conditional">
                    <param name="selector" type="select" label="Lineage data source">
                        <option value="download">Download lineage data</option>
                        <option value="cached" selected="true">Use cached lineage data</option>
                    </param>
                    <when value="cached">
                        <param name="cached_db" label="Cached database with lineage" type="select">
                            <options from_data_table="busco_database">
                                <validator message="No BUSCO database is available" type="no_options" />
                            </options>
                        </param>
                    </when>
                    <when value="download">
                        <param name="lineage" type="select" label="Lineage" help="Select a lineage for using BUSCO">
                            <option value="metazoa">Metazoa</option>
                            <option value="eukaryota">Eukaryota</option>
                            <option value="arthropoda">Arthropoda</option>
                            <option value="vertebrata">Vertebrata</option>
                            <option value="fungi">Fungi</option>
                            <option value="bacteria">Bacteria</option>
                        </param>
                    </when>
                </conditional>
            </when>
        </conditional>
        <section name="advanced_options" title="Advanced options">
            <param name="strand_specific" argument="-ss" type="boolean" truevalue="-ss" falsevalue="" checked="false" label="Strand-specific RNA-seq data"
                help="Set if transcripts were assembled using strand-specific RNA-Seq data in order to benefit from knowing whether the transcript originated from the + or - strand"/>
            <param argument="--min_alignment" type="integer" min="0" value="50" label="Minimal alignment length to be used" help="Default value is 50"/>
            <param argument="--blat" type="boolean" truevalue="--blat" falsevalue="" checked="false" label="Run with BLAT instead of GMAP" help="BLAT is especially useful for aligning long sequences and gapped mapping, which cannot be performed properly by other fast sequence mappers designed for short reads." />
            <!-- GeneMarkST is not in Bioconda -->
            <!--param argument="-\-gene_mark" type="boolean" truevalue="-\-gene_mark" falsevalue="" checked="false" label="Run with GeneMarkS-T gene prediction tool?" 
                help="GeneMarkS-T allows to predict genes in the assembled transcripts without reference genome"/-->
            <param argument="--meta" type="boolean" truevalue="--meta" falsevalue="" checked="false" label="Meta Transcriptome" help="Run quality assessment for meta-transcriptome assemblies" />
            <!-- Both thresholds are percentages (the help section refers to them as 50%/95%). -->
            <param argument="--lower_threshold" type="integer" min="0" max="100" value="50" label="Lower threshold for x-assembled/covered/matched metrics." />
            <param argument="--upper_threshold" type="integer" min="0" max="100" value="95" label="Upper threshold for x-assembled/covered/matched metrics." />
            <param argument="--prokaryote" type="boolean" truevalue="--prokaryote" falsevalue="" checked="false" label="Prokaryotic organism(s)" help="Use this option if the genome is prokaryotic"/>
        </section>
        <section name="output_options" title="Output options" expanded="true">
            <param name="out_sr" type="select" multiple="true" display="checkboxes" label="Short report formats">
                <option value="tabular">Tabular</option>
                <option value="tex">TeX</option>
                <option value="pdf" selected="true">PDF</option>
            </param>
            <param name="out_add" type="select" multiple="true" display="checkboxes" label="Additional outputs">
                <option value="complete">Complete report</option>
                <option value="fasta">FASTA files</option>
                <option value="logs">Logs</option>
                <option value="gz">Compressed output folder</option>
            </param>
        </section>
    </inputs>
    <outputs>
        <!-- Complete report: the stats.txt file assembled at the end of the command. -->
        <data name="stats" format="txt" label="${tool.name} on ${on_string}: complete report">
            <filter>output_options['out_add'] and "complete" in output_options['out_add']</filter>
        </data>

        <!-- One element per *.log file found in outputdir/logs. -->
        <collection name="list_logs" type="list" label="${tool.name} on ${on_string}: logs">
            <filter>output_options['out_add'] and "logs" in output_options['out_add']</filter>
            <discover_datasets pattern="(?P&lt;name&gt;.+)\.log"
                               directory="outputdir/logs"
                               ext="txt"
                               visible="false"/>
        </collection>

        <!-- FASTA files moved into ./fasta_files; requires both an annotation and a reference. -->
        <collection name="fasta_files" type="list" label="${tool.name} on ${on_string}: FASTA files">
            <filter>output_options['out_add'] and "fasta" in output_options['out_add']</filter>
            <filter>gene_coordinates['selector'] == 'true'</filter>
            <filter>reference</filter>
            <discover_datasets pattern="(?P&lt;name&gt;.+)\.fasta"
                               directory="fasta_files"
                               ext="fasta"
                               visible="false"/>
        </collection>

        <!-- Tarball of the whole rnaQUAST output directory. -->
        <data name="compressed_files" format="tgz"
              from_work_dir="results.tar.gz"
              label="${tool.name} on ${on_string}: compressed results folder">
            <filter>output_options['out_add'] and "gz" in output_options['out_add']</filter>
        </data>

        <!-- Short reports, one dataset per selected format. -->
        <data name="short_report_pdf" format="pdf"
              from_work_dir="outputdir/short_report.pdf"
              label="${tool.name} on ${on_string}: short report (pdf)">
            <filter>output_options['out_sr'] and "pdf" in output_options['out_sr']</filter>
        </data>
        <data name="short_report_tex" format="txt"
              from_work_dir="outputdir/short_report.tex"
              label="${tool.name} on ${on_string}: short report (tex)">
            <filter>output_options['out_sr'] and "tex" in output_options['out_sr']</filter>
        </data>
        <data name="short_report_tabular" format="tabular"
              from_work_dir="outputdir/short_report.tsv"
              label="${tool.name} on ${on_string}: short report (tabular)">
            <filter>output_options['out_sr'] and "tabular" in output_options['out_sr']</filter>
        </data>
    </outputs>
    <tests>
        <!-- Test 01: Minimum input, tabular output -->
        <test expect_num_outputs="1">
            <param name="transcripts" value="transcriptome01.fasta"/>
            <section name="output_options">
                <param name="out_sr" value="tabular"/>
            </section>
            <output name="short_report_tabular" file="test_01_short_report.tab"/>
        </test>
        <!-- Test 02: Transcriptome, single-end reads, tabular output -->
        <test expect_num_outputs="1">
            <param name="transcripts" value="transcriptome01.fasta"/>
            <section name="output_options">
                <param name="out_sr" value="tabular"/>
            </section>
            <conditional name="reads_option">
                <param name="selector" value="single"/>
                <param name="single_reads" value="single_end.fastq.gz"/>
            </conditional>
            <output name="short_report_tabular">
                <assert_contents>
                    <has_text text="Transcripts" />
                    <has_size value="95" delta="5"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test 03: Transcriptome and annotation, tabular output -->
        <!-- NOTE(review): single-end mode is selected with an empty dataset value; confirm this is intended. -->
        <test expect_num_outputs="1">
            <param name="transcripts" value="transcriptome01.fasta"/>
            <conditional name="gene_coordinates">
                <param name="selector" value="true"/>
                <param name="gtf" value="reference.gtf"/>
            </conditional>
            <section name="output_options">
                <param name="out_sr" value="tabular"/>
            </section>
            <conditional name="reads_option">
                <param name="selector" value="single"/>
                <param name="single_reads" value=""/>
            </conditional>
            <output name="short_report_tabular" file="test_03_short_report.tab"/>
        </test>
        <!-- Test 04: Transcriptome, annotation and single-end reads, tabular output -->
        <test expect_num_outputs="1">
            <param name="transcripts" value="transcriptome01.fasta"/>
            <conditional name="gene_coordinates">
                <param name="selector" value="true"/>
                <param name="gtf" value="reference.gtf"/>
            </conditional>
            <section name="output_options">
                <param name="out_sr" value="tabular"/>
            </section>
            <conditional name="reads_option">
                <param name="selector" value="single"/>
                <param name="single_reads" value="single_end.fastq.gz"/>
            </conditional>
            <output name="short_report_tabular">
                <assert_contents>
                    <has_text text="Transcripts" />
                    <has_size value="140" delta="5"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test 05: Transcriptome, annotation and paired-end reads, tabular output -->
        <test expect_num_outputs="1">
            <param name="transcripts" value="transcriptome01.fasta"/>
            <conditional name="gene_coordinates">
                <param name="selector" value="true"/>
                <param name="gtf" value="reference.gtf"/>
            </conditional>
            <section name="output_options">
                <param name="out_sr" value="tabular"/>
            </section>
            <conditional name="reads_option">
                <param name="selector" value="paired"/>
                <param name="forward_reads" value="input_F.fastqsanger"/>
                <!-- The reverse mate file, as in tests 06 and 07 (previously the forward file was passed twice). -->
                <param name="reverse_reads" value="input_R.fastqsanger"/>
            </conditional>
            <output name="short_report_tabular">
                <assert_contents>
                    <has_text text="Transcripts" />
                    <has_size value="140" delta="5"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test 06: Transcriptome, annotation, paired-end reads and BUSCO, tabular output -->
        <test expect_num_outputs="1">
            <param name="transcripts" value="transcriptome01.fasta"/>
            <conditional name="gene_coordinates">
                <param name="selector" value="true"/>
                <param name="gtf" value="reference.gtf"/>
            </conditional>
            <conditional name="reads_option">
                <param name="selector" value="paired"/>
                <param name="forward_reads" value="input_F.fastqsanger"/>
                <param name="reverse_reads" value="input_R.fastqsanger"/>
            </conditional>
            <section name="output_options">
                <param name="out_sr" value="tabular"/>
            </section>
            <conditional name="use_busco">
                <param name="selector" value="true"/>
                <conditional name="lineage_conditional">
                    <param name="selector" value="cached"/>
                    <param name="cached_db" value="busco-demo-db-20230328"/>
                </conditional>
            </conditional>
            <output name="short_report_tabular">
                <assert_contents>
                    <has_text text="Transcripts" />
                    <has_size value="140" delta="5"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test 07: Transcriptome, reference, annotation, paired-end reads and BUSCO, additional outputs -->
        <test expect_num_outputs="4">
            <param name="transcripts" value="transcriptome01.fasta"/>
            <conditional name="gene_coordinates">
                <param name="selector" value="true"/>
                <param name="gtf" value="reference.gtf"/>
            </conditional>
            <param name="reference" value="reference.fasta"/>
            <conditional name="reads_option">
                <param name="selector" value="paired"/>
                <param name="forward_reads" value="input_F.fastqsanger"/>
                <param name="reverse_reads" value="input_R.fastqsanger"/>
            </conditional>
            <conditional name="use_busco">
                <param name="selector" value="true"/>
                <conditional name="lineage_conditional">
                    <param name="selector" value="cached"/>
                    <param name="cached_db" value="busco-demo-db-20230328"/>
                </conditional>
            </conditional>
            <section name="output_options">
                <param name="out_sr" value="pdf,tabular"/>
                <param name="out_add" value="fasta,gz"/>
            </section>
            <output_collection name="fasta_files" type="list" count="7">
                <element name="transcriptome01.paralogs" file="test_07_paralogs.fasta" ftype="fasta"/>
            </output_collection>
            <output name="short_report_pdf" file="test_07_short_report.pdf" ftype="pdf" compare="sim_size" delta="1000"/>
            <output name="short_report_tabular" file="test_07_short_report.tab" ftype="tabular"/>
            <output name="compressed_files" ftype="tgz">
                <assert_contents>
                    <has_archive_member path="./outputdir/.*" n="62" delta="0"/>
                    <has_archive_member path="./outputdir/logs/.*" n="10" delta="0"/>
                    <has_archive_member path="./outputdir/transcriptome01_output/.*" n="45" delta="0"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test 08: Multiple transcript inputs, all outputs -->
        <test expect_num_outputs="6">
            <param name="transcripts" value="transcriptome01.fasta,transcriptome02.fasta"/>
            <param name="reference" value="reference.fasta"/>
            <conditional name="gene_coordinates">
                <param name="selector" value="true"/>
                <param name="gtf" value="reference.gtf"/>
            </conditional>
            <!-- out_add belongs to the output_options section, next to out_sr. -->
            <section name="output_options">
                <param name="out_sr" value="tabular,pdf"/>
                <param name="out_add" value="complete,fasta,logs,gz"/>
            </section>
            <conditional name="use_busco">
                <param name="selector" value="true"/>
                <conditional name="lineage_conditional">
                    <param name="selector" value="cached"/>
                    <param name="cached_db" value="busco-demo-db-20230328"/>
                </conditional>
            </conditional>
            <conditional name="reads_option">
                <param name="selector" value="single"/>
                <param name="single_reads" value="single_end.fastq.gz"/>
            </conditional>
            <output name="short_report_tabular" file="test_08_short_report.tab" ftype="tabular"/>
            <output name="short_report_pdf" file="test_08_short_report.pdf" ftype="pdf"/>
            <output name="stats" file="test_08_complete_report.tab" ftype="txt" lines_diff="6"/>
            <output_collection name="fasta_files" type="list" count="14">
                <element name="transcriptome01.paralogs" file="test_08_paralogs.fasta" ftype="fasta"/>
            </output_collection>
            <output_collection name="list_logs" type="list" count="14">
                <element name="STAR.out" ftype="txt">
                    <assert_contents>
                        <has_text text="STAR --runThreadN"/>
                        <has_text text="finished successfully"/>
                    </assert_contents>
                </element>
                <element name="gmap_build.out" ftype="txt">
                    <assert_contents>
                        <has_text text="No alternate scaffolds observed"/>
                    </assert_contents>
                </element>
                <element name="rnaQUAST" ftype="txt">
                    <assert_contents>
                        <has_text text="THE QUALITY OF TRANSCRIPTOME ASSEMBLY DONE"/>
                        <has_text text="Thank you for using rnaQUAST!"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>

    </tests>
    <help><![CDATA[

.. class:: infomark

**Purpose**

rnaQUAST is a tool for evaluating RNA-Seq assemblies using reference genome and gene database. In addition, rnaQUAST is also capable 
of estimating gene database coverage by raw reads and de novo quality assessment.

.. class:: infomark

**rnaQUAST pipeline**

To evaluate quality of the assembled transcripts, rnaQUAST takes a reference genome in FASTA format and optionally its gene database in 
GFF/GTF format. The user provides one or more FASTA files with transcripts, which are aligned to the given reference genome using GMAP 
or BLAT. The alignments are analyzed to calculate simple metrics and then are matched against the isoforms from the gene database in order 
to obtain statistics that represent completeness and correctness levels of the assembly. In addition, rnaQUAST is capable of estimating 
gene database coverage by raw reads using STAR or TopHat2. For de novo quality assessment when reference genome and gene database are 
unavailable, the transcripts are analyzed using BUSCO.

.. class:: infomark

**Metrics and alignment analysis**

rnaQUAST calculates various metrics without using alignment information, e.g. length distribution and N50 of the assembled transcripts. 
Additionally, rnaQUAST computes the following statistics for the gene database: the total number of genes and isoforms, isoform and exon 
length distribution, average number of exons per gene, etc.

To analyze transcripts' alignments, rnaQUAST first filters out short partial alignments (shorter than a user-defined threshold, default 
value is 50 bp). Such short alignments are typically caused by genomic repeats and thus are ignored. Afterwards, rnaQUAST selects the 
best-scored spliced alignment for each transcript. If a transcript has more than one alignment with the highest score, it is reported 
as multiply aligned. Otherwise, it is considered to be uniquely aligned. If the best-scored alignment is discordant (e.g. the transcript 
has partial alignments that are either mapped to different strands or to different chromosomes) the transcript is classified as misassembled. 
Transcripts without misassemblies are analyzed to calculate such metrics as average transcript alignment fraction and mismatch rate.

For the simplicity of explanation, transcript is further referred to as a sequence generated by the assembler and isoform denotes a sequence 
from the gene database. rnaQUAST matches best-scored alignments of non-misassembled transcripts to the isoforms' coordinates and analyzes 
them to estimate how well the isoforms are covered by the assembly. rnaQUAST computes such metrics as database coverage (the total number 
of covered bases of all isoforms divided by the total length of all isoforms) and the number of 50%/95%-assembled isoforms. An isoform is 
considered to be x%-assembled if it has at least x% covered by a single transcript. Vice versa, to evaluate how well the assembled 
transcripts are covered by the isoforms, rnaQUAST estimates the number of unannotated transcripts (that align to the genome, but do not 
match to any isoform) and the number of 50%/95%-matched transcripts (that have the corresponding fraction mapped to an isoform). The 
thresholds described above (50% and 95%) can be changed by the user with the lower and upper threshold options.


    ]]>    </help>
    <citations>
        <!-- rnaQUAST publication; no whitespace inside the DOI. -->
        <citation type="doi">10.1093/bioinformatics/btw218</citation>
    </citations>
</tool>