view tetranscript.xml @ 4:8e96d712a2bd draft default tip

planemo upload for repository https://github.com/mhammell-laboratory/TEtranscripts commit 3de50462b3bbe717953c65a1c00935999327a4f0
author iuc
date Thu, 03 Nov 2022 18:17:35 +0000
parents 313414e9abcf
children
line wrap: on
line source

<?xml version="1.0"?>
<tool id="tetoolkit_tetranscripts" name="TEtranscripts" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@">
    <description>annotates reads to genes and transposable elements</description>
    <!-- Version tokens: @TOOL_VERSION@ is substituted into both the tool's version attribute and the
         package requirement below; @VERSION_SUFFIX@ is substituted into the tool's version attribute only. -->
    <macros>
        <token name="@TOOL_VERSION@">2.2.3</token>
        <token name="@VERSION_SUFFIX@">0</token>
    </macros>
    <!-- Software dependency, pinned to the same version as @TOOL_VERSION@. -->
    <requirements>
        <requirement type="package" version="@TOOL_VERSION@">tetranscripts</requirement>
    </requirements>
    <!-- Command used to report the version of the underlying program. -->
    <version_command>TEtranscripts --version</version_command>
    <command detect_errors="exit_code"><![CDATA[
## Cheetah template that builds the TEtranscripts command line.
## Job success is judged by the exit code (detect_errors="exit_code").

## When the log output is selected, TEtranscripts is piped into tee. Without
## pipefail the pipeline would return the exit status of tee, so a failing
## TEtranscripts run would be reported as successful.
set -o pipefail &&

## initialize
## file extension is required
ln -s '$GTF' 'gene_annotation.gtf' &&
ln -s '$TE' 'transposable_annotation.gtf' &&

## run
TEtranscripts
## required: all treatment files follow -t, all control files follow -c
-t
#for $s in $sample_rep
    '${s.t}'
#end for
-c
#for $s in $sample_rep
    '${s.c}'
#end for
--GTF 'gene_annotation.gtf'
--TE 'transposable_annotation.gtf'
## optional
--stranded '$io.stranded'
$io.sortByPos
## output files are collected from the working directory by their 'result' prefix
--project 'result'
--mode '$ap.mode'
--minread $ap.minread
## only passed when the user supplied a value
#if $ap.fragmentLength
    --fragmentLength $ap.fragmentLength
#end if
--iteration $ap.iteration
--padj $ap.padj
--foldchange $ap.foldchange
#if 'log' in $oo.out
    --verbose 3
    ## write stdout and stderr to the log dataset while still echoing them
    2>&1 | tee '$out_log'
#end if
        ]]></command>
    <inputs>
        <!-- Each repeat instance holds one treatment and one control BAM file; at least two instances are required. -->
        <repeat name="sample_rep" min="2" title="Select input data">
            <param argument="-t" type="data" format="bam" label="Treatment sample file"/>
            <param argument="-c" type="data" format="bam" label="Control sample file"/>
        </repeat>
        <!-- Annotation files; the command block symlinks both to fixed *.gtf names before running. -->
        <param argument="--GTF" type="data" format="gtf" label="Select file for gene annotations"/>
        <param argument="--TE" type="data" format="gtf" label="Select file for transposable element annotations"/>
        <!-- Options describing the input alignments. -->
        <section name="io" title="Input options" expanded="true">
            <param argument="--stranded" type="select" label="Select library type">
                <option value="no">Library is unstranded (no)</option>
                <option value="forward">Second-strand cDNA library e.g. QIAseq stranded (forward)</option>
                <option value="reverse">First-strand cDNA library e.g. Illumina TruSeq stranded (reverse)</option>
            </param>
            <param argument="--sortByPos" type="boolean" truevalue="--sortByPos" falsevalue="" label="Are input files sorted by chromosome position?"/>
        </section>
        <!-- Counting and differential-expression settings passed straight through to TEtranscripts. -->
        <section name="ap" title="Advanced parameters">
            <param argument="--mode" type="select" label="Set TE counting mode">
                <option value="multi">Distribute among all alignments (multi)</option>
                <option value="uniq">Unique mappers only (uniq)</option>
            </param>
            <param argument="--minread" type="integer" value="1" min="0" label="Set read count cutoff"/>
            <param argument="--fragmentLength" type="integer" min="0" optional="true" label="Set average length of fragment used for single-end sequencing" help="For paired-end, estimated from the input alignment file. For single-end, ignored by default."/>
            <param argument="--iteration" type="integer" value="100" min="0" label="Set maximum number of iterations used to optimize multi-reads assignment"/>
            <param argument="--padj" type="float" value="0.05" min="0.0" max="1.0" label="Set FDR cutoff for significance"/>
            <param argument="--foldchange" type="float" value="1.0" min="0.0" label="Set fold-change ratio (absolute) cutoff for differential expression"/>
        </section>
        <!-- Selects which result datasets are created; the option values are the keys tested by the output filters. -->
        <section name="oo" title="Output options">
            <param name="out" type="select" multiple="true" optional="false" label="Select output file(s)" help="Result files for gene TE analysis and sigDiff gene TE will be created if more than one dataset is applied.">
                <option value="cnttable" selected="true">cntTable</option>
                <option value="deseq2">DESeq2.R</option>
                <option value="gta" selected="true">Gene TE Analysis</option>
                <option value="log">Log</option>
                <option value="sgt" selected="true">SigDiff Gene TE</option>
            </param>
        </section>
    </inputs>
    <!-- Each dataset is created only when its key is selected in the "out" parameter of the "oo" section.
         All outputs except the log are collected from files in the working directory whose names start
         with the "result" project prefix set in the command block; the log has no from_work_dir because
         the command block writes it directly via tee. -->
    <outputs>
        <data name="out_cnt" format="tabular" from_work_dir="result.cntTable" label="${tool.name} on ${on_string}: cntTable">
            <filter>'cnttable' in oo['out']</filter>
        </data>
        <data name="out_deseq2" format="txt" from_work_dir="result_DESeq2.R" label="${tool.name} on ${on_string}: DESeq2.R">
            <filter>'deseq2' in oo['out']</filter>
        </data>
        <data name="out_gta" format="tabular" from_work_dir="result_gene_TE_analysis.txt" label="${tool.name} on ${on_string}: Gene TE analysis">
            <filter>'gta' in oo['out']</filter>
        </data>
        <data name="out_log" format="txt" label="${tool.name} on ${on_string}: Log">
            <filter>'log' in oo['out']</filter>
        </data>
        <data name="out_sgt" format="tabular" from_work_dir="result_sigdiff_gene_TE.txt" label="${tool.name} on ${on_string}: SigDiff Gene TE">
            <filter>'sgt' in oo['out']</filter>
        </data>
    </outputs>
    <tests>
        <!-- 
            test data sources:
            https://github.com/mhammell-laboratory/tetoolkit-test-data 
            https://github.com/mhammell-laboratory/TEtranscripts/issues/66
        -->

        <!-- #1: default parameters; only the three outputs selected by default
             (cntTable, Gene TE Analysis, SigDiff Gene TE) are expected -->
        <test expect_num_outputs="3">
            <repeat name="sample_rep">
                <param name="t" value="treatment1.bam"/>
                <param name="c" value="control1.bam"/>
            </repeat>
            <repeat name="sample_rep">
                <param name="t" value="treatment2.bam"/>
                <param name="c" value="control2.bam"/>
            </repeat>
            <param name="GTF" value="gtf.gtf"/>
            <param name="TE" value="te.gtf"/>
            <output name="out_cnt">
                <assert_contents>
                    <has_n_lines n="295"/>
                    <has_text_matching expression="gene.+"/>
                    <has_text_matching expression="TIRANT.+"/>
                </assert_contents>
            </output>
            <output name="out_gta">
                <assert_contents>
                    <has_n_lines n="71"/>
                    <has_line line="baseMean&#009;log2FoldChange&#009;lfcSE&#009;stat&#009;pvalue&#009;padj"/>
                    <has_text_matching expression="TIRANT.+"/>
                </assert_contents>
            </output>
            <output name="out_sgt">
                <assert_contents>
                    <has_n_lines n="1"/>
                    <has_line line="baseMean&#009;log2FoldChange&#009;lfcSE&#009;stat&#009;pvalue&#009;padj"/>
                </assert_contents>
            </output>
        </test>
        <!-- #2: forward-stranded, position-sorted input with every advanced parameter
             set to a non-default value; all five outputs requested -->
        <test expect_num_outputs="5">
            <repeat name="sample_rep">
                <param name="t" value="treatment1.bam"/>
                <param name="c" value="control1.bam"/>
            </repeat>
            <repeat name="sample_rep">
                <param name="t" value="treatment2.bam"/>
                <param name="c" value="control2.bam"/>
            </repeat>
            <param name="GTF" value="gtf.gtf"/>
            <param name="TE" value="te.gtf"/>
            <section name="io">
                <param name="stranded" value="forward"/>
                <param name="sortByPos" value="true"/>
            </section>
            <section name="ap">
                <param name="mode" value="uniq"/>
                <param name="minread" value="2"/>
                <param name="fragmentLength" value="10"/>
                <param name="iteration" value="90"/>
                <param name="padj" value="0.06"/>
                <param name="foldchange" value="2.0"/>
            </section>
            <section name="oo">
                <param name="out" value="cnttable,deseq2,gta,sgt,log"/>
            </section>
            <output name="out_cnt">
                <assert_contents>
                    <has_n_lines n="295"/>
                    <has_text_matching expression="gene.+"/>
                    <has_text_matching expression="TIRANT.+"/>
                </assert_contents>
            </output>
            <output name="out_deseq2">
                <assert_contents>
                    <has_n_lines n="14"/>
                    <has_text_matching expression="data.+"/>
                </assert_contents>
            </output>
            <output name="out_gta">
                <assert_contents>
                    <has_n_lines n="3"/>
                    <has_line line="baseMean&#009;log2FoldChange&#009;lfcSE&#009;stat&#009;pvalue&#009;padj"/>
                    <has_text_matching expression="DNAREP1.+"/>
                </assert_contents>
            </output>
            <output name="out_log">
                <assert_contents>
                    <has_text_matching expression="INFO.+"/>
                </assert_contents>
            </output>
            <output name="out_sgt">
                <assert_contents>
                    <has_n_lines n="1"/>
                    <has_line line="baseMean&#009;log2FoldChange&#009;lfcSE&#009;stat&#009;pvalue&#009;padj"/>
                </assert_contents>
            </output>
        </test>
        <!-- #3: reverse-stranded library with all other parameters left at their defaults;
             all five outputs requested -->
        <test expect_num_outputs="5">
            <repeat name="sample_rep">
                <param name="t" value="treatment1.bam"/>
                <param name="c" value="control1.bam"/>
            </repeat>
            <repeat name="sample_rep">
                <param name="t" value="treatment2.bam"/>
                <param name="c" value="control2.bam"/>
            </repeat>
            <param name="GTF" value="gtf.gtf"/>
            <param name="TE" value="te.gtf"/>
            <section name="io">
                <param name="stranded" value="reverse"/>
            </section>
            <section name="oo">
                <param name="out" value="cnttable,deseq2,gta,sgt,log"/>
            </section>
            <output name="out_cnt">
                <assert_contents>
                    <has_n_lines n="295"/>
                    <has_text_matching expression="gene.+"/>
                    <has_text_matching expression="TIRANT.+"/>
                </assert_contents>
            </output>
            <output name="out_deseq2">
                <assert_contents>
                    <has_n_lines n="14"/>
                    <has_text_matching expression="data.+"/>
                </assert_contents>
            </output>
            <output name="out_gta">
                <assert_contents>
                    <has_n_lines n="23"/>
                    <has_line line="baseMean&#009;log2FoldChange&#009;lfcSE&#009;stat&#009;pvalue&#009;padj"/>
                    <has_text_matching expression="TART.+"/>
                </assert_contents>
            </output>
            <output name="out_log">
                <assert_contents>
                    <has_text_matching expression="INFO.+"/>
                </assert_contents>
            </output>
            <output name="out_sgt">
                <assert_contents>
                    <has_n_lines n="2"/>
                    <has_line line="baseMean&#009;log2FoldChange&#009;lfcSE&#009;stat&#009;pvalue&#009;padj"/>
                    <has_text_matching expression="Gypsy12.+"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
.. class:: infomark

**What it does**

TEtranscripts is a software package that utilizes both unambiguously (uniquely) and ambiguously (multi-) mapped reads to perform differential enrichment analyses from high throughput sequencing experiments. Currently, most expression analysis software packages are not optimized for handling the complexities involved in quantifying highly repetitive regions of the genome, especially transposable elements (TE), from short sequencing reads. Although transposable elements make up between 20 and 80% of many eukaryotic genomes and contribute significantly to the cellular transcriptome output, the difficulty in quantifying their abundances from high throughput sequencing experiments has led them to be largely ignored in most studies. TEtranscripts provides a noticeable improvement in the recovery of TE transcripts from RNA-Seq experiments and identification of peaks associated with repetitive regions of the genome.

**Input**

GTF files for gene annotation can be obtained from `UCSC RefSeq <http://genome.ucsc.edu/cgi-bin/hgTables>`_, Ensembl, `iGenomes <http://support.illumina.com/sequencing/sequencing_software/igenome.html>`_ or other annotation databases. GTF files for TE annotations are custom-generated from `UCSC RepeatMasker <http://genome.ucsc.edu/cgi-bin/hgTables>`_ or other annotation databases. They contain two custom attributes, class_id and family_id, corresponding to the class (e.g. LINE) and family (e.g. L1) of the corresponding transposable element. A unique ID (e.g. L1Md_Gf_dup1) is also assigned for each TE annotation in the transcript_id attribute.

**Output**

TEtranscripts quantifies both gene and transposable element (TE) transcript abundances from RNA-Seq experiments, utilizing both uniquely and ambiguously mapped short read sequences. It processes the short read alignments (BAM files) and proportionally assigns read counts to the corresponding gene or TE based on the user-provided annotation files (GTF files). In addition, TEtranscripts combines multiple libraries and performs differential analysis using DESeq2.

.. class:: infomark

**References**

More information is available on the `project website <http://hammelllab.labsites.cshl.edu/software/#TEtranscripts>`_ and `GitHub <https://github.com/mhammell-laboratory/TEtranscripts>`_.
    ]]></help>
    <!-- Publications to cite when using this tool, referenced by DOI. -->
    <citations>
        <citation type="doi">10.1093/bioinformatics/btv422</citation>
        <citation type="doi">10.1007/978-1-4939-7710-9_11</citation>
    </citations>
</tool>