Mercurial > repos > artbio > gsc_high_dimensions_visualisation
diff high_dim_visu.xml @ 0:cad0001b9cfb draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/gsc_high_dimension_visualization commit 09dcd74dbc01f448518cf3db3e646afb0675a6fe
author | artbio |
---|---|
date | Mon, 24 Jun 2019 13:39:11 -0400 |
parents | |
children | c756ab726a85 |
line wrap: on
line diff
<!--
    Galaxy tool wrapper around high_dim_visu.R (expected next to this file in the tool directory).
    It takes an expression table and produces a PDF visualisation using one of three methods
    selected through the "visualisation" conditional: PCA, HCPC or t-SNE.
    Optionally, data points are contrasted with a two-column factor table, and the scatter plot
    coordinates are returned as a tabular dataset.
-->
<tool id="high_dimensions_visualisation" name="Generate PCA, tSNE and HCPC" version="0.9.0">
    <description>from highly dimensional expression data</description>
    <requirements>
        <requirement type="package" version="1.3.2=r3.3.2_0">r-optparse</requirement>
        <requirement type="package" version="1.39=r3.3.2_0">r-factominer</requirement>
        <requirement type="package" version="1.0.5=r3.3.2_0">r-factoextra</requirement>
        <requirement type="package" version="0.13=r3.3.2_0">r-rtsne</requirement>
        <requirement type="package" version="2.2.1=r3.3.2_0">r-ggplot2</requirement>
        <requirement type="package" version="0.4.1=r3.3.2_0">r-ggfortify</requirement>
    </requirements>
    <!-- NOTE(review): the command below also sets detect_errors="exit_code"; this stdio block is
         probably redundant with it. Confirm against the Galaxy tool XML documentation before removing. -->
    <stdio>
        <exit_code range="1:" level="fatal" description="Tool exception" />
    </stdio>
    <!-- Command line template (Cheetah). Every substituted value is single-quoted, including the
         script path, so that paths or values containing spaces reach Rscript as one argument.
         Only the options of the selected visualisation method are passed to the script. -->
    <command detect_errors="exit_code"><![CDATA[
        Rscript '$__tool_directory__/high_dim_visu.R'
            --data '$input'
            --sep '$input_sep'
            --colnames '$input_header'
        #if $factor_condition.factor_choice == 'Yes':
            --factor '$factor_condition.factor'
        #end if
        #if $labels == "yes":
            --labels 'TRUE'
        #else
            --labels 'FALSE'
        #end if
        #if $coord == "yes":
            --table_coordinates '$table_coordinates'
        #end if
            --visu_choice '$visualisation.visu_choice'
        #if $visualisation.visu_choice == "tSNE":
            --Rtsne_seed '$visualisation.Rtsne_seed'
            --Rtsne_perplexity '$visualisation.Rtsne_perplexity'
            --Rtsne_theta '$visualisation.Rtsne_theta'
            --Rtsne_max_iter '$visualisation.Rtsne_max_iter'
            --Rtsne_dims '$visualisation.Rtsne_dims'
            --Rtsne_initial_dims '$visualisation.Rtsne_initial_dims'
            --Rtsne_pca '$visualisation.Rtsne_pca'
            --Rtsne_pca_center '$visualisation.Rtsne_pca_center'
            --Rtsne_pca_scale '$visualisation.Rtsne_pca_scale'
            --Rtsne_normalize '$visualisation.Rtsne_normalize'
            --Rtsne_exaggeration_factor '$visualisation.Rtsne_exaggeration_factor'
        #end if

        #if $visualisation.visu_choice == "HCPC":
            --HCPC_ncluster '$visualisation.HCPC_ncluster'
            --HCPC_npc '$visualisation.HCPC_npc'
            --HCPC_metric '$visualisation.HCPC_metric'
            --HCPC_method '$visualisation.HCPC_method'
            --HCPC_consol '$visualisation.HCPC_consol'
            --HCPC_itermax '$visualisation.HCPC_itermax'
            --HCPC_min '$visualisation.HCPC_min'
            --HCPC_max '$visualisation.HCPC_max'
            --HCPC_clusterCA '$visualisation.HCPC_clusterCA'
            --HCPC_kk '$visualisation.HCPC_kk'
        #end if

        #if $visualisation.visu_choice == "PCA":
            --PCA_npc '$visualisation.PCA_npc'
        #end if

            --pdf_out '$pdf_out'
    ]]></command>
    <inputs>
        <param name="input" type="data" format="txt,tabular" label="expression data"/>
        <param name="input_sep" type="select" label="Input column separator">
            <option value="tab" selected="true">Tabs</option>
            <option value=",">Comma</option>
        </param>
        <param name="input_header" type="select" label="Consider first line of input file as header?">
            <option value="TRUE" selected="true">Yes</option>
            <option value="FALSE">No</option>
        </param>
        <param name="labels" type="select" label="Add sample labels to scatter plot">
            <option value="no" selected="true">No Labels</option>
            <option value="yes">Label points</option>
        </param>
        <!-- Optional factor table used to contrast (colour) the data points. -->
        <conditional name="factor_condition">
            <param label="Do you wish to contrast cells with a factor?" name="factor_choice" type="select">
                <option value="Yes">Yes</option>
                <option value="No" selected="true">No</option>
            </param>
            <when value="Yes">
                <param name="factor" type="data" format="tabular" label="Factor to contrast data"
                       help="A two-column data frame, first column contains data labels, second column contains the levels of a factor to contrast visualisation" />
            </when>
            <when value="No">
            </when>
        </conditional>
        <!-- Method selection; each branch exposes only the parameters of that method. -->
        <conditional name="visualisation">
            <param label="Choose visualisation method" name="visu_choice" type="select">
                <option value="PCA" selected="true">PCA</option>
                <option value="HCPC">HCPC</option>
                <option value="tSNE">t-SNE</option>
            </param>
            <when value="tSNE">
                <param name="Rtsne_seed" value="42" type="integer" label="Seed value for reproducibility of t-SNE" help="Set to 42 as default" />
                <param name="Rtsne_dims" value="2" type="integer" label="dims (t-SNE)" help="Output dimensionality (should not be greater than 3)" />
                <param name="Rtsne_pca" type="select" label="pca (t-SNE)" help="Whether an initial PCA step should be performed">
                    <option value="TRUE" selected="true">Yes</option>
                    <option value="FALSE">No</option>
                </param>
                <param name="Rtsne_initial_dims" value="50" type="integer" label="initial dims (t-SNE)" help="The number of dimensions that should be retained in the initial PCA step" />
                <param name="Rtsne_pca_center" type="select" label="Centering data" help="Should data be centered before pca is applied?">
                    <option value="TRUE" selected="true">Yes</option>
                    <option value="FALSE">No</option>
                </param>
                <param name="Rtsne_pca_scale" type="select" label="Scaling data" help="Should data be scaled before pca is applied?">
                    <option value="TRUE">Yes</option>
                    <option value="FALSE" selected="true">No</option>
                </param>
                <param name="Rtsne_normalize" type="select" label="Normalisation of data"
                       help="Should variables (gene expressions) be normalized internally prior to distance calculations?">
                    <option value="TRUE" selected="true">Yes</option>
                    <option value="FALSE">No</option>
                </param>
                <param name="Rtsne_perplexity" value="10.0" type="float" label="perplexity (t-SNE)" help="should be less than ((nbr observations)-1)/3" />
                <param name="Rtsne_theta" value="1.0" type="float" label="theta (t-SNE)"/>
                <param name="Rtsne_exaggeration_factor" value="12.0" type="float" label="Exaggeration factor" help="Exaggeration factor used to multiply the P matrix in the first part of the optimization" />
                <param name="Rtsne_max_iter" value="1000" type="integer" label="Number of iterations (default: 1000)"
                       help="The number of iterations that Rtsne executes to improve low dim representation (gradient descent optimization)" />
            </when>
            <when value="HCPC">
                <param name="HCPC_npc" value="5" type="integer" label="Number of principal components to keep"
                       help="The number of dimensions which are kept for HCPC analysis (default=5)" />
                <param name="HCPC_ncluster" value="-1" type="integer" label="Number of clusters in Hierar. Clustering"
                       help="nb.clust, the number of clusters to consider in the hierarchical clustering. (default: -1, let HCPC optimize the number)" />
                <!-- The option values below are passed verbatim to the R script; keep their spelling unchanged. -->
                <param name="HCPC_metric" type="select" label="Dissimilarity metric" help="Metric to be used for calculating dissimilarities between observations: 'euclidian' or 'manhattan'">
                    <option value="euclidian" selected="true">euclidian</option>
                    <option value="manhattan">manhattan</option>
                </param>
                <param name="HCPC_method" type="select" label="Clustering method" help="Clustering method between 'ward', 'average', 'single', 'complete', 'weighted'">
                    <option value="ward" selected="true">ward</option>
                    <option value="average">average</option>
                    <option value="single">single</option>
                    <option value="complete">complete</option>
                    <option value="weighted">weighted</option>
                </param>
                <param name="HCPC_consol" type="select" label="k-means consolidation" help="If TRUE, a k-means consolidation is performed">
                    <option value="TRUE" selected="true">Yes</option>
                    <option value="FALSE">No</option>
                </param>
                <param name="HCPC_itermax" value="10" type="integer" label="Maximum number of iterations for consolidation"
                       help="(default=10)" />
                <param name="HCPC_min" value="3" type="integer" label="min number of clusters"
                       help="The least possible number of clusters suggested (default=3)" />
                <!-- NOTE(review): HCPC_max and HCPC_kk are declared as text although their defaults are numeric;
                     presumably the R script interprets the default as "not set". Verify in high_dim_visu.R
                     before changing the type. -->
                <param name="HCPC_max" value="-1" type="text" label="max number of clusters"
                       help="The highest possible number of clusters suggested, by default the minimum between 10 and the number of individuals divided by 2. (default=-1)" />
                <param name="HCPC_clusterCA" type="select" label="clusterCA, Clustering against rows or columns" help="default(rows)">
                    <option value="rows" selected="true">Rows</option>
                    <option value="cols">Columns</option>
                </param>
                <param name="HCPC_kk" value="-1" type="text" label="kk, Number of clusters used in a Kmeans preprocessing"
                       help="No k-means consolidation is done if a kk value is provided (default=-1)" />
            </when>
            <when value="PCA">
                <param name="PCA_npc" value="5" type="integer" label="Number of principal components to keep" help="The number of dimensions which are kept for PCA analysis (default=5)" />
            </when>
        </conditional>
        <param label="Return scatter plot table coordinates" name="coord" type="select">
            <option value="no" selected="true">No</option>
            <option value="yes">Yes</option>
        </param>
    </inputs>
    <outputs>
        <data name="pdf_out" format="pdf" label="${visualisation.visu_choice} on ${on_string}" />
        <!-- Only created when the user asks for the coordinates table. -->
        <data name="table_coordinates" format="tabular" label="Scatter plot coordinates from ${tool.name} on ${on_string}">
            <filter>coord == 'yes'</filter>
        </data>
    </outputs>
    <tests>
        <!-- test PCA -->
        <test>
            <param name="input" value="cpm_input.tsv" ftype="txt"/>
            <param name="labels" value="yes" />
            <param name="visu_choice" value="PCA" />
            <param name="factor_choice" value="No" />
            <output name="pdf_out" file="pca.labels.pdf" ftype="pdf"/>
        </test>
        <test>
            <param name="input" value="cpm_input.tsv" ftype="txt"/>
            <param name="labels" value="no" />
            <param name="visu_choice" value="PCA" />
            <param name="factor_choice" value="No" />
            <output name="pdf_out" file="pca.nolabels.pdf" ftype="pdf"/>
        </test>
        <!-- test Coordinates tables on PCA -->
        <test>
            <param name="input" value="cpm_input.tsv" ftype="txt"/>
            <param name="labels" value="no" />
            <param name="visu_choice" value="PCA" />
            <param name="coord" value="yes" />
            <param name="factor_choice" value="No" />
            <output name="pdf_out" file="pca.nolabels.pdf" ftype="pdf"/>
            <output name="table_coordinates" file="pca.coord.tab" ftype="tabular"/>
        </test>
        <!-- test factor contrasting on PCA -->
        <test>
            <param name="input" value="cpm_input.tsv" ftype="txt"/>
            <param name="labels" value="no" />
            <param name="visu_choice" value="PCA" />
            <param name="factor_choice" value="Yes" />
            <param name="factor" value="factor.tsv" ftype="txt"/>
            <output name="pdf_out" file="pca.nolabels.factors.pdf" ftype="pdf"/>
        </test>
        <test>
            <param name="input" value="cpm_input.tsv" ftype="txt"/>
            <param name="labels" value="no" />
            <param name="visu_choice" value="PCA" />
            <param name="factor_choice" value="Yes" />
            <param name="factor" value="shuffled_factor.tsv" ftype="txt"/>
            <output name="pdf_out" file="pca.nolabels.factors.pdf" ftype="pdf"/>
        </test>
        <!-- test HCPC -->
        <test>
            <param name="input" value="cpm_input.tsv" ftype="txt"/>
            <param name="labels" value="yes" />
            <param name="visu_choice" value="HCPC" />
            <param name="HCPC_npc" value="5"/>
            <param name="HCPC_ncluster" value="-1"/>
            <output name="pdf_out" file="hcpc.labels.pdf" ftype="pdf"/>
        </test>
        <!-- test factor contrasting on HCPC -->
        <test>
            <param name="input" value="cpm_input.tsv" ftype="txt"/>
            <param name="labels" value="no" />
            <param name="visu_choice" value="HCPC" />
            <param name="HCPC_npc" value="5"/>
            <param name="HCPC_ncluster" value="-1"/>
            <param name="factor_choice" value="Yes" />
            <param name="factor" value="factor.tsv" ftype="txt"/>
            <output name="pdf_out" file="hcpc.nolabels.factor.pdf" ftype="pdf"/>
        </test>
        <test>
            <param name="input" value="cpm_input.tsv" ftype="txt"/>
            <param name="labels" value="no" />
            <param name="HCPC_npc" value="5"/>
            <param name="HCPC_ncluster" value="-1"/>
            <param name="visu_choice" value="HCPC" />
            <output name="pdf_out" file="hcpc.nolabels.pdf" ftype="pdf"/>
        </test>
        <test>
            <param name="input" value="cpm_input.tsv" ftype="txt"/>
            <param name="labels" value="yes" />
            <param name="visu_choice" value="HCPC" />
            <param name="coord" value="yes" />
            <param name="HCPC_method" value="average"/>
            <param name="HCPC_metric" value="manhattan"/>
            <param name="HCPC_npc" value="4" />
            <output name="pdf_out" file="hcpc-2.labels.pdf" ftype="pdf"/>
            <output name="table_coordinates" file="hcpc-2.coord.tab" ftype="tabular"/>
        </test>
        <test>
            <param name="input" value="cpm_input.tsv" ftype="txt"/>
            <param name="labels" value="yes" />
            <param name="visu_choice" value="HCPC" />
            <param name="coord" value="yes" />
            <param name="HCPC_method" value="single"/>
            <param name="HCPC_metric" value="euclidian"/>
            <param name="HCPC_npc" value="4" />
            <param name="HCPC_clusterCA" value="cols" />
            <output name="pdf_out" file="hcpc-3.labels.pdf" ftype="pdf"/>
            <output name="table_coordinates" file="hcpc-3.coord.tab" ftype="tabular"/>
        </test>
        <!-- test t-SNE -->
        <test>
            <param name="input" value="cpm_input.tsv" ftype="txt"/>
            <param name="labels" value="yes" />
            <param name="visu_choice" value="tSNE" />
            <param name="Rtsne_seed" value="49"/>
            <param name="Rtsne_perplexity" value="10"/>
            <param name="Rtsne_theta" value="1" />
            <output name="pdf_out" file="tsne.labels.pdf" ftype="pdf"/>
        </test>
        <test>
            <param name="input" value="cpm_input.tsv" ftype="txt"/>
            <param name="labels" value="no" />
            <param name="visu_choice" value="tSNE" />
            <param name="Rtsne_seed" value="49"/>
            <param name="Rtsne_perplexity" value="10"/>
            <param name="Rtsne_theta" value="1" />
            <output name="pdf_out" file="tsne.nolabels.pdf" ftype="pdf"/>
        </test>
        <test>
            <param name="input" value="cpm_input.tsv" ftype="txt"/>
            <param name="labels" value="no" />
            <param name="visu_choice" value="tSNE" />
            <param name="coord" value="yes" />
            <param name="Rtsne_seed" value="42"/>
            <param name="Rtsne_perplexity" value="5.0"/>
            <param name="Rtsne_theta" value="1.0" />
            <param name="Rtsne_dims" value="3" />
            <param name="Rtsne_exaggeration_factor" value="15.0" />
            <output name="pdf_out" file="tsne-2.nolabels.pdf" ftype="pdf"/>
            <output name="table_coordinates" file="tsne-2.coord.tab" ftype="tabular"/>
        </test>
        <!-- test factor contrasting on t-SNE -->
        <test>
            <param name="input" value="cpm_input.tsv" ftype="txt"/>
            <param name="labels" value="yes" />
            <param name="visu_choice" value="tSNE" />
            <param name="factor_choice" value="Yes" />
            <param name="factor" value="shuffled_factor.tsv" ftype="txt"/>
            <param name="Rtsne_seed" value="49"/>
            <param name="Rtsne_perplexity" value="10"/>
            <param name="Rtsne_theta" value="1" />
            <output name="pdf_out" file="tsne.labels.factor.pdf" ftype="pdf"/>
        </test>
    </tests>
    <help>

**What it does**

Takes as an input a matrix of n observations (columns, generally n RNAseq libraries) of k variables
(rows, generally k genes).

k variables define a space of k dimensions. Any observation
of k expression values for k genes (the purpose of one RNAseq experiment) can be assigned
to a position in the k-dim space, of coordinates c1, c2, c3, ..., ck.

Since visualisation in more than 3 dimensions is not easy for a human being, there is
a number of methods to "reduce" or "project" a k-dim space in a space of 2 or 3 dimensions.
This is of great help, not only to summarise the data, but also to find similarities, common trends
between the data (under the hypothesis that similar data are closer in the k-dimension space).

This tool returns the visualisation of a dimensional reduction using either:

* Principal Components Analysis (PCA)
* Hierarchical Clustering of Principal Components (HCPC)
* t-distributed Stochastic Neighbor Embedding (t-SNE)

When the option "Return scatter plot table coordinates" is set to "Yes", the tool returns in addition
the table of the coordinates of the observations (eg RNAseq libraries) in the low dim space,
which can be used for post-treatment or to further adjust the provided visualisation.

**Contrast data with a factor**

The tool offers the possibility to colour data points according to the levels of a factor.
To use the option "Factor to contrast data", provide a tab-separated, two-column table
with first column containing the cell/data library identifiers (same identifiers as those
provided as column headers in the input data table) and second column containing the corresponding
factor levels value. This table does not need to be sorted in the same order as in the data
table. It may also contain more identifiers than those provided in the data table.

    </help>
    <citations>
        <citation type="bibtex">@Article{,
            title = {Visualizing High-Dimensional Data Using t-SNE},
            volume = {9},
            pages = {2579-2605},
            year = {2008},
            author = {L.J.P. {van der Maaten} and G.E. Hinton},
            journal = {Journal of Machine Learning Research},
            }
        </citation>
        <citation type="bibtex">@Article{,
            title = {Accelerating t-SNE using Tree-Based Algorithms},
            volume = {15},
            pages = {3221-3245},
            year = {2014},
            author = {L.J.P. {van der Maaten}},
            journal = {Journal of Machine Learning Research},
            }
        </citation>
        <citation type="bibtex">@Manual{,
            title = {{Rtsne}: T-Distributed Stochastic Neighbor Embedding using
            Barnes-Hut Implementation},
            author = {Jesse H. Krijthe},
            year = {2015},
            note = {R package version 0.15},
            url = {https://github.com/jkrijthe/Rtsne},
            }
        </citation>
    </citations>
</tool>