view cpm_tpm_rpk.xml @ 5:bcff1eb6fdb5 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/cpm_tpm_rpk commit d6ef3d1c1d490967b7b79138573a86a8d5235c43
author artbio
date Fri, 06 Oct 2023 18:00:05 +0000
parents be358a1ebf67
children d2575928b824
line wrap: on
line source

<tool id="cpm_tpm_rpk" name="Generate CPM, TPM, RPK" version="0.5.1">
    <description>from raw counts expression values</description>
    <requirements>
        <!-- Package dependencies of this tool, each pinned to an exact version. -->
        <requirement type="package" version="1.7.1">r-optparse</requirement>
        <requirement type="package" version="0.16">r-rtsne</requirement>
        <requirement type="package" version="3.3.6">r-ggplot2</requirement>
        <requirement type="package" version="0.4.14">r-ggfortify</requirement>
        <requirement type="package" version="1.4.4">r-reshape2</requirement>
    </requirements>
    <stdio>
        <!-- Any exit code of 1 or above is reported as a fatal "Tool exception". -->
        <exit_code range="1:" level="fatal" description="Tool exception" />
    </stdio>
    <command detect_errors="exit_code"><![CDATA[
        ## Run cpm_tpm_rpk.R on the count matrix.
        ## Dataset paths and the tool directory are single-quoted so that paths
        ## containing spaces or shell metacharacters reach Rscript intact.
        ## The separator values stay unquoted on purpose: the whitespace option is
        ## written with a backslash escape that the shell itself must interpret.
        Rscript '$__tool_directory__/cpm_tpm_rpk.R'
            -d '$input'
            -t $option.type_transfo
            -s $input_sep
            -c $input_header
            ## TPM and RPK both need the gene length table and its parsing options.
            #if str($option.type_transfo) in ["tpm", "rpk"]:
                -f '$option.gene_file'
                --gene_sep $option.gene_sep
                --gene_header $option.gene_header
            #end if
            -l $log
            ## Only one of the two table outputs exists, depending on the log choice.
            #if $log == "FALSE":
                -o '$output'
            #else if $log == "TRUE":
                -o '$output_log'
            #end if
            #if $visu_option.visualisation == "yes":
               --visu 'TRUE'
               #if $visu_option.tsne_labels == "yes":
                   --tsne_labels 'TRUE'
               #else
                   --tsne_labels 'FALSE'
               #end if
               --seed '$visu_option.seed'
               --perp '$visu_option.perp'
               --theta '$visu_option.theta'
               --tsne_out '$tsne_out'
               --pca_out '$pca_out'
            #end if
]]></command>
    <inputs>
        <!-- Count matrix and how to parse it. -->
        <param name="input" type="data" format="txt" label="Raw counts of expression data"/>
        <param name="input_sep" type="select" label="Input column separator">
            <option value="tab" selected="true">Tabs</option>
            <option value=",">Comma</option>
            <option value="\ ">Whitespace</option>
            <option value=".">Dots</option>
            <option value="_">Underscores</option>
            <option value="-">Dashes</option>
        </param>
        <param name="input_header" type="select" label="Consider first line of input file as header?">
            <option value="TRUE" selected="true">Yes</option>
            <option value="FALSE">No</option>
        </param>

        <!-- Transformation choice; TPM and RPK additionally ask for a gene length file. -->
        <conditional name="option">
            <param name="type_transfo" type="select" label="Type of transformation">
                <option value="cpm" selected="true">CPM</option>
                <option value="tpm">TPM</option>
                <option value="rpk">RPK</option>
                <option value="none">NONE</option>
            </param>
            <when value="tpm">
                <param name="gene_file" type="data" format="txt" label="Gene length file"/>
                <param name="gene_sep" type="select" label="Gene length column separator">
                    <option value="tab" selected="true">Tabs</option>
                    <option value=",">Commas</option>
                    <option value="\ ">Whitespaces</option>
                    <option value=".">Dots</option>
                    <option value="_">Underscores</option>
                    <option value="-">Dashes</option>
                </param>
                <param name="gene_header" type="select" label="Consider first line of gene length file as header ?">
                    <option value="TRUE" selected="true">Yes</option>
                    <option value="FALSE">No</option>
                </param>
            </when>
            <when value="rpk">
                <param name="gene_file" type="data" format="txt" label="Gene length file"/>
                <param name="gene_sep" type="select" label="Gene length column separator">
                    <option value="tab" selected="true">Tabs</option>
                    <option value=",">Commas</option>
                    <option value="\ ">Whitespaces</option>
                    <option value=".">Dots</option>
                    <option value="_">Underscores</option>
                    <option value="-">Dashes</option>
                </param>
                <param name="gene_header" type="select" label="Consider first line of gene length file as header ?">
                    <option value="TRUE" selected="true">Yes</option>
                    <option value="FALSE">No</option>
                </param>
            </when>
            <!-- CPM and NONE take no extra parameters. -->
            <when value="cpm"/>
            <when value="none"/>
        </conditional>

        <param name="log" type="select" label="Data should be log transformed ?">
            <option value="FALSE" selected="true">No</option>
            <option value="TRUE">Yes</option>
        </param>

        <!-- Optional t-SNE / PCA plots and their tuning parameters. -->
        <conditional name="visu_option">
            <param name="visualisation" type="select" label="Visualisation of data with various dimensionality reduction methods">
                <option value="no" selected="true">No visualisation</option>
                <option value="yes">t-SNE and PCA</option>
            </param>
            <when value="yes">
                <param name="seed" value="49.0" type="float" label="Seed value for reproducibility of t-SNE" help="Set to 49 as default"/>
                <param name="perp" value="10.0" type="float" label="perplexity (t-SNE)" help="should be less than ((nbre observations)-1)/3"/>
                <param name="theta" value="1.0" type="float" label="theta (t-SNE)"/>
                <param name="tsne_labels" type="select" label="Add labels to points plots">
                    <option value="no" selected="true">No Labels</option>
                    <option value="yes">Label points</option>
                </param>
            </when>
            <when value="no"/>
        </conditional>
    </inputs>
    <outputs>
        <!-- Result table: exactly one of the next two datasets is kept, selected by the "log" input. -->
        <data name="output" format="tabular" label="${option.type_transfo} from ${on_string}">
            <filter>log == "FALSE"</filter>
        </data>
        <data name="output_log" format="tabular" label="log2(${option.type_transfo} +1) from ${on_string}">
            <filter>log == "TRUE"</filter>
        </data>
        <!-- Plots: both PDFs are kept only when visualisation is switched on. -->
        <data name="tsne_out" format="pdf" label="tsne from ${on_string}">
            <filter>visu_option['visualisation'] == 'yes'</filter>
        </data>
        <data name="pca_out" format="pdf" label="PCA from ${on_string}">
            <filter>visu_option['visualisation'] == 'yes'</filter>
        </data>
    </outputs>
    <tests>
        <!--
            When log is TRUE the "output" dataset is filtered out and the table is
            written to "output_log" instead, so log-transformed tests must assert on
            "output_log" by name rather than on "output".
        -->
        <!-- test t-SNE -->
        <!-- no transformation, no log, plots with labels -->
        <test expect_num_outputs="3">
            <param name="input" value="none.tab" ftype="tabular"/>
            <param name="type_transfo" value="none"/>
            <param name="log" value="FALSE"/>
            <param name="visualisation" value="yes"/>
            <param name="seed" value="49"/>
            <param name="perp" value="10"/>
            <param name="theta" value="1" />
            <param name="tsne_labels" value="yes" />
            <output name="output" file="none.tab" ftype="tabular"  />
            <output name="tsne_out" file="none_tsne.pdf" ftype="pdf" compare="sim_size" delta="1000" />
            <output name="pca_out" file="none_pca.pdf" ftype="pdf" />
        </test>
        <!-- log2(CPM + 1), plots with labels -->
        <test expect_num_outputs="3">
            <param name="input" value="counts.tab" ftype="tabular"/>
            <param name="type_transfo" value="cpm"/>
            <param name="log" value="TRUE"/>
            <param name="visualisation" value="yes"/>
            <param name="seed" value="49"/>
            <param name="perp" value="2"/>
            <param name="theta" value="1" />
            <param name="tsne_labels" value="yes" />
            <output name="output_log" file="logcpm.tab" ftype="tabular" />
            <output name="tsne_out" file="tsne.pdf" ftype="pdf" compare="sim_size" delta="1000" />
            <output name="pca_out" file="pca.pdf" ftype="pdf" />
        </test>
        <!-- log2(TPM + 1), plots without labels -->
        <test expect_num_outputs="3">
            <param name="input" value="counts.tab" ftype="tabular"/>
            <param name="type_transfo" value="tpm"/>
            <param name="gene_file" value="gene_length.tab" ftype="tabular"/>
            <param name="log" value="TRUE"/>
            <param name="visualisation" value="yes"/>
            <param name="seed" value="49"/>
            <param name="perp" value="2"/>
            <param name="theta" value="1" />
            <param name="tsne_labels" value="no" />
            <output name="output_log" file="logtpm.tab" ftype="tabular" />
            <output name="tsne_out" file="tsne.nolab.pdf" ftype="pdf" compare="sim_size" delta="1000" />
            <output name="pca_out" file="pca.nolab.pdf" ftype="pdf" />
        </test>
        <!-- test without t-SNE -->
        <!-- CPM -->
        <test expect_num_outputs="1">
            <param name="input" value="counts.tab" ftype="tabular"/>
            <param name="type_transfo" value="cpm"/>
            <output name="output" file="cpm.tab" ftype="tabular"/>
        </test>
        <!-- log2(CPM + 1) -->
        <test expect_num_outputs="1">
            <param name="input" value="counts.tab" ftype="tabular"/>
            <param name="type_transfo" value="cpm"/>
            <param name="log" value="TRUE"/>
            <output name="output_log" file="logcpm.tab" ftype="tabular" />
        </test>
        <!-- TPM -->
        <test expect_num_outputs="1">
            <param name="input" value="counts.tab" ftype="tabular"/>
            <param name="type_transfo" value="tpm"/>
            <param name="gene_file" value="gene_length.tab" ftype="tabular"/>
            <param name="gene_header" value="TRUE"/>
            <output name="output" file="tpm.tab" ftype="tabular"/>
        </test>
        <!-- log2(TPM + 1) -->
        <test expect_num_outputs="1">
            <param name="input" value="counts.tab" ftype="tabular"/>
            <param name="type_transfo" value="tpm"/>
            <param name="gene_file" value="gene_length.tab" ftype="tabular"/>
            <param name="gene_header" value="TRUE"/>
            <param name="log" value="TRUE"/>
            <output name="output_log" file="logtpm.tab" ftype="tabular" />
        </test>
        <!-- RPK -->
        <test expect_num_outputs="1">
            <param name="input" value="counts.tab" ftype="tabular"/>
            <param name="type_transfo" value="rpk"/>
            <param name="gene_file" value="gene_length.tab" ftype="tabular"/>
            <param name="gene_header" value="TRUE"/>
            <output name="output" file="rpk.tab" ftype="tabular" />
        </test>
        <!-- log2(RPK + 1) -->
        <test expect_num_outputs="1">
            <param name="input" value="counts.tab" ftype="tabular"/>
            <param name="type_transfo" value="rpk"/>
            <param name="gene_file" value="gene_length.tab" ftype="tabular"/>
            <param name="gene_header" value="TRUE"/>
            <param name="log" value="TRUE"/>
            <output name="output_log" file="logrpk.tab" ftype="tabular" />
        </test>
    </tests>
    <help>

**What it does**

Takes a raw count expression matrix and returns a table of normalized expression values.

Normalization can be:

- CPM (Counts Per Million) are obtained by dividing counts by the library counts sum and multiplying the results by a million.
- RPK (Reads Per Kilobases) are obtained by dividing read counts by gene lengths (expressed in kilo-nucleotides).
- TPM (Transcripts Per Million) are obtained by dividing RPK values by the sum of all RPK values in a sample and multiplying the results by 1 million.

RPK and TPM require a two-column correspondence table (gene name, gene length) where the length is specified in nucleotides. Both these metrics are relevant only for sequencing of full-length RNAs.

Note: if your tabular input has a header line, do not comment it out (i.e. do not start the header line with a '#' character).

Computed values may be log-transformed (log2([CPM or RPK or TPM]+1))

    </help>
    <citations>
        <!-- BibTeX entries: the t-SNE paper and the ClusterR R package manual. -->
        <!-- NOTE(review): ClusterR is cited here but is not listed in <requirements>; confirm this citation is still relevant. -->
        <citation type="bibtex">@Article{,
            title = {Visualizing High-Dimensional Data Using t-SNE},
            volume = {9},
            pages = {2579-2605},
            year = {2008},
            author = {L.J.P. {van der Maaten} and G.E. Hinton},
            journal = {Journal of Machine Learning Research},
            }
        </citation>
        <citation type="bibtex">@Manual{,
            title = {{ClusterR}: Gaussian Mixture Models, K-Means, Mini-Batch-Kmeans,
            K-Medoids and Affinity Propagation Clustering},
            author = {Lampros Mouselimis},
            year = {2019},
            note = {R package version 1.1.9},
            url = {https://github.com/mlampros/ClusterR},
            }
        </citation>
    </citations>
</tool>