diff high_dim_visu.xml @ 0:cad0001b9cfb draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/gsc_high_dimension_visualization commit 09dcd74dbc01f448518cf3db3e646afb0675a6fe
author artbio
date Mon, 24 Jun 2019 13:39:11 -0400
parents
children c756ab726a85
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/high_dim_visu.xml	Mon Jun 24 13:39:11 2019 -0400
@@ -0,0 +1,376 @@
+<tool id="high_dimensions_visualisation" name="Generate PCA, tSNE and HCPC" version="0.9.0">
+    <description>from highly dimensional expression data</description>
+    <!-- R packages resolved for the high_dim_visu.R script run by the command below.
+         NOTE(review): every version carries an "=r3.3.2_0" suffix, which looks like a
+         conda build string tying the packages to R 3.3.2 builds; confirm before
+         bumping any single version. -->
+    <requirements>
+        <requirement type="package" version="1.3.2=r3.3.2_0">r-optparse</requirement>
+        <requirement type="package" version="1.39=r3.3.2_0">r-factominer</requirement>
+        <requirement type="package" version="1.0.5=r3.3.2_0">r-factoextra</requirement>
+        <requirement type="package" version="0.13=r3.3.2_0">r-rtsne</requirement>
+        <requirement type="package" version="2.2.1=r3.3.2_0">r-ggplot2</requirement>
+        <requirement type="package" version="0.4.1=r3.3.2_0">r-ggfortify</requirement>
+    </requirements>
+    <!-- Exit codes of 1 and above (range "1:") are reported as a fatal "Tool exception". -->
+    <stdio>
+        <exit_code range="1:" level="fatal" description="Tool exception" />
+    </stdio>
+    <command detect_errors="exit_code"><![CDATA[
+        ## Run high_dim_visu.R from the tool directory. The script path is quoted so
+        ## that an installation path containing spaces is still passed as one argument.
+        Rscript '$__tool_directory__/high_dim_visu.R'
+            --data '$input'
+            --sep '$input_sep'
+            --colnames '$input_header'
+            ## the factor file is passed only when the user chose to contrast the data
+            #if $factor_condition.factor_choice == 'Yes':
+                --factor '$factor_condition.factor'
+            #end if
+            ## translate the yes/no select value into the TRUE/FALSE string given to the script
+            #if $labels == "yes":
+                --labels 'TRUE'
+            #else
+                --labels 'FALSE'
+            #end if
+            ## the coordinates table is an optional output, requested through the coord select
+            #if $coord == "yes":
+                --table_coordinates '$table_coordinates'
+            #end if
+            --visu_choice '$visualisation.visu_choice'
+            ## method-specific options: only the block of the selected method is emitted
+            #if $visualisation.visu_choice == "tSNE":
+                --Rtsne_seed '$visualisation.Rtsne_seed'
+                --Rtsne_perplexity '$visualisation.Rtsne_perplexity'
+                --Rtsne_theta '$visualisation.Rtsne_theta'
+                --Rtsne_max_iter '$visualisation.Rtsne_max_iter'
+                --Rtsne_dims '$visualisation.Rtsne_dims'
+                --Rtsne_initial_dims '$visualisation.Rtsne_initial_dims'
+                --Rtsne_pca '$visualisation.Rtsne_pca'
+                --Rtsne_pca_center '$visualisation.Rtsne_pca_center'
+                --Rtsne_pca_scale '$visualisation.Rtsne_pca_scale'
+                --Rtsne_normalize '$visualisation.Rtsne_normalize'
+                --Rtsne_exaggeration_factor '$visualisation.Rtsne_exaggeration_factor'
+            #end if
+            #if $visualisation.visu_choice == "HCPC":
+                --HCPC_ncluster '$visualisation.HCPC_ncluster'
+                --HCPC_npc '$visualisation.HCPC_npc'
+                --HCPC_metric '$visualisation.HCPC_metric'
+                --HCPC_method '$visualisation.HCPC_method'
+                --HCPC_consol '$visualisation.HCPC_consol'
+                --HCPC_itermax '$visualisation.HCPC_itermax'
+                --HCPC_min '$visualisation.HCPC_min'
+                --HCPC_max '$visualisation.HCPC_max'
+                --HCPC_clusterCA '$visualisation.HCPC_clusterCA'
+                --HCPC_kk '$visualisation.HCPC_kk'
+            #end if
+            #if $visualisation.visu_choice == "PCA":
+                --PCA_npc '$visualisation.PCA_npc'
+            #end if
+            --pdf_out '$pdf_out'
+]]></command>
+    <inputs>
+        <!-- Expression table and the options telling the script how to read it -->
+        <param name="input" type="data" format="txt,tabular" label="expression data" />
+        <param name="input_sep" type="select" label="Input column separator">
+            <option value="tab" selected="true">Tabs</option>
+            <option value=",">Comma</option>
+        </param>
+        <param name="input_header" type="select" label="Consider first line of input file as header?">
+            <option value="TRUE" selected="true">Yes</option>
+            <option value="FALSE">No</option>
+        </param>
+        <!-- "yes" makes the command pass TRUE to the labels option of the script, "no" passes FALSE -->
+        <param name="labels" type="select" label="Add sample labels to scatter plot">
+            <option value="no" selected="true">No Labels</option>
+            <option value="yes">Label points</option>
+        </param>
+        <!-- Optional factor used to colour the data points; the factor dataset is
+             passed to the script only when "Yes" is selected. -->
+        <conditional name="factor_condition">
+            <param name="factor_choice" type="select" label="Do you wish to contrast cells with a factor?">
+                <option value="Yes">Yes</option>
+                <option value="No" selected="true">No</option>
+            </param>
+            <when value="Yes">
+                <param name="factor" type="data" format="tabular" label="Factor to contrast data"
+                       help="A two-column data frame, first column contains data labels, second column contains the levels of a factor to contrast visualisation" />
+            </when>
+            <when value="No" />
+        </conditional>
+        <!-- Visualisation method. Each <when> exposes the options that the command
+             forwards to the script for that method only. Option values are passed
+             verbatim on the command line, so only displayed labels and help texts
+             are reworded here; the values themselves are unchanged. -->
+        <conditional name="visualisation">
+            <param name="visu_choice" type="select" label="Choose visualisation method">
+                <option value="PCA" selected="true">PCA</option>
+                <option value="HCPC">HCPC</option>
+                <option value="tSNE">t-SNE</option>
+            </param>
+            <when value="tSNE">
+                <param name="Rtsne_seed" type="integer" value="42" label="Seed value for reproducibility of t-SNE" help="Set to 42 as default" />
+                <!-- min/max enforce the limit stated in the help text -->
+                <param name="Rtsne_dims" type="integer" value="2" min="1" max="3" label="dims (t-SNE)" help="Output dimensionality (should not be greater than 3)" />
+                <param name="Rtsne_pca" type="select" label="pca (t-SNE)" help="Whether an initial PCA step should be performed">
+                    <option value="TRUE" selected="true">Yes</option>
+                    <option value="FALSE">No</option>
+                </param>
+                <param name="Rtsne_initial_dims" type="integer" value="50" label="initial dims (t-SNE)" help="The number of dimensions that should be retained in the initial PCA step" />
+                <param name="Rtsne_pca_center" type="select" label="Centering data" help="Should data be centered before pca is applied?">
+                    <option value="TRUE" selected="true">Yes</option>
+                    <option value="FALSE">No</option>
+                </param>
+                <param name="Rtsne_pca_scale" type="select" label="Scaling data" help="Should data be scaled before pca is applied?">
+                    <option value="TRUE">Yes</option>
+                    <option value="FALSE" selected="true">No</option>
+                </param>
+                <param name="Rtsne_normalize" type="select" label="Normalisation of data"
+                       help="Should variables (gene expressions) be normalized internally prior to distance calculations?">
+                    <option value="TRUE" selected="true">Yes</option>
+                    <option value="FALSE">No</option>
+                </param>
+                <param name="Rtsne_perplexity" type="float" value="10.0" label="perplexity (t-SNE)" help="should be less than ((nbr observations)-1)/3" />
+                <param name="Rtsne_theta" type="float" value="1.0" label="theta (t-SNE)" help="Speed/accuracy trade-off (increase for less accuracy), set to 0.0 for exact t-SNE" />
+                <param name="Rtsne_exaggeration_factor" type="float" value="12.0" label="Exaggeration factor" help="Exaggeration factor used to multiply the P matrix in the first part of the optimization" />
+                <param name="Rtsne_max_iter" type="integer" value="1000" label="Number of iterations (default: 1000)"
+                       help="The number of iterations that Rtsne executes to improve low dim representation (gradient descent optimization)" />
+            </when>
+            <when value="HCPC">
+                <param name="HCPC_npc" type="integer" value="5" label="Number of principal components to keep"
+                       help="The number of dimensions which are kept for HCPC analysis (default=5)" />
+                <param name="HCPC_ncluster" type="integer" value="-1" label="Number of clusters in Hierar. Clustering"
+                       help="nb.clust, the number of clusters to consider in the hierarchical clustering. (default : -1, let HCPC optimize the number)" />
+                <!-- NOTE(review): the value is spelled "euclidian" and is passed as-is to the
+                     script; it is kept unchanged because the script side is not visible here. -->
+                <param name="HCPC_metric" type="select" label="Dissimilarity metric" help="Metric to be used for calculating dissimilarities between observations ('euclidian' or 'manhattan')">
+                    <option value="euclidian" selected="true">euclidian</option>
+                    <option value="manhattan">manhattan</option>
+                </param>
+                <param name="HCPC_method" type="select" label="Clustering method" help="Clustering method between 'ward', 'average', 'single', 'complete', 'weighted'">
+                    <option value="ward" selected="true">ward</option>
+                    <option value="average">average</option>
+                    <option value="single">single</option>
+                    <option value="complete">complete</option>
+                    <option value="weighted">weighted</option>
+                </param>
+                <param name="HCPC_consol" type="select" label="k-means consolidation" help="If TRUE, a k-means consolidation is performed">
+                    <option value="TRUE" selected="true">Yes</option>
+                    <option value="FALSE">No</option>
+                </param>
+                <param name="HCPC_itermax" type="integer" value="10" label="Maximum number of iterations for consolidation"
+                       help="(default=10)" />
+                <param name="HCPC_min" type="integer" value="3" label="min number of clusters"
+                       help="The least possible number of clusters suggested (default=3)" />
+                <param name="HCPC_max" type="text" value="-1" label="max number of clusters"
+                       help="The highest possible number of clusters suggested, by default the minimum between 10 and the number of individuals divided by 2. (default=-1)" />
+                <param name="HCPC_clusterCA" type="select" label="clusterCA, Clustering against rows or columns" help="default(rows)">
+                    <option value="rows" selected="true">Rows</option>
+                    <option value="cols">Columns</option>
+                </param>
+                <param name="HCPC_kk" type="text" value="-1" label="kk, Number of clusters used in a Kmeans preprocessing"
+                       help="No k-means consolidation is done if a kk value is provided (default=-1)" />
+            </when>
+            <when value="PCA">
+                <param name="PCA_npc" type="integer" value="5" label="Number of principal components to keep" help="The number of dimensions which are kept for PCA analysis (default=5)" />
+            </when>
+        </conditional>
+            <!-- "yes" enables the optional table_coordinates output -->
+            <param name="coord" type="select" label="Return scatter plot table coordinates">
+                <option value="no" selected="true">No</option>
+                <option value="yes">Yes</option>
+            </param>
+             
+    </inputs>
+    <!-- pdf_out has no filter and is always declared; table_coordinates is kept
+         only when the "coord" select is set to "yes" (see its filter). -->
+    <outputs>
+        <data name="pdf_out" format="pdf" label="${visualisation.visu_choice} on ${on_string}" />
+        <data name="table_coordinates" format="tabular" label="Scatter plot coordinates from ${tool.name} on ${on_string}" >
+            <filter>coord == 'yes'</filter>
+        </data>
+    </outputs>
+    <!-- Functional tests, grouped by visualisation method. Factor datasets are
+         staged as "tabular" so that they match the format declared on the
+         "factor" input parameter. -->
+    <tests>
+        <!-- test PCA -->
+        <test>
+            <param name="input" value="cpm_input.tsv" ftype="txt"/>
+            <param name="labels" value="yes" />
+            <param name="visu_choice" value="PCA" />
+            <param name="factor_choice" value="No" />
+            <output name="pdf_out" file="pca.labels.pdf" ftype="pdf"/>
+        </test>
+        <test>
+            <param name="input" value="cpm_input.tsv" ftype="txt"/>
+            <param name="labels" value="no" />
+            <param name="visu_choice" value="PCA" />
+            <param name="factor_choice" value="No" />
+            <output name="pdf_out" file="pca.nolabels.pdf" ftype="pdf"/>
+        </test>
+        <!-- test Coordinates tables on PCA -->
+        <test>
+            <param name="input" value="cpm_input.tsv" ftype="txt"/>
+            <param name="labels" value="no" />
+            <param name="visu_choice" value="PCA" />
+            <param name="coord" value="yes" />
+            <param name="factor_choice" value="No" />
+            <output name="pdf_out" file="pca.nolabels.pdf" ftype="pdf"/>
+            <output name="table_coordinates" file="pca.coord.tab" ftype="tabular"/>
+        </test>
+        <!-- test factor contrasting on PCA -->
+        <test>
+            <param name="input" value="cpm_input.tsv" ftype="txt"/>
+            <param name="labels" value="no" />
+            <param name="visu_choice" value="PCA" />
+            <param name="factor_choice" value="Yes" />
+            <param name="factor" value="factor.tsv" ftype="tabular"/>
+            <output name="pdf_out" file="pca.nolabels.factors.pdf" ftype="pdf"/>
+        </test>
+        <!-- same expected plot with the factor rows in a different order -->
+        <test>
+            <param name="input" value="cpm_input.tsv" ftype="txt"/>
+            <param name="labels" value="no" />
+            <param name="visu_choice" value="PCA" />
+            <param name="factor_choice" value="Yes" />
+            <param name="factor" value="shuffled_factor.tsv" ftype="tabular"/>
+            <output name="pdf_out" file="pca.nolabels.factors.pdf" ftype="pdf"/>
+        </test>
+        <!-- test HCPC -->
+        <test>
+            <param name="input" value="cpm_input.tsv" ftype="txt"/>
+            <param name="labels" value="yes" />
+            <param name="visu_choice" value="HCPC" />
+            <param name="HCPC_npc" value="5"/>
+            <param name="HCPC_ncluster" value="-1"/>
+            <output name="pdf_out" file="hcpc.labels.pdf" ftype="pdf"/>
+        </test>
+        <!-- test factor contrasting on HCPC -->
+        <test>
+            <param name="input" value="cpm_input.tsv" ftype="txt"/>
+            <param name="labels" value="no" />
+            <param name="visu_choice" value="HCPC" />
+            <param name="HCPC_npc" value="5"/>
+            <param name="HCPC_ncluster" value="-1"/>
+            <param name="factor_choice" value="Yes" />
+            <param name="factor" value="factor.tsv" ftype="tabular"/>
+            <output name="pdf_out" file="hcpc.nolabels.factor.pdf" ftype="pdf"/>
+        </test>
+        <test>
+            <param name="input" value="cpm_input.tsv" ftype="txt"/>
+            <param name="labels" value="no" />
+            <param name="HCPC_npc" value="5"/>
+            <param name="HCPC_ncluster" value="-1"/>
+            <param name="visu_choice" value="HCPC" />
+            <output name="pdf_out" file="hcpc.nolabels.pdf" ftype="pdf"/>
+        </test>
+        <test>
+            <param name="input" value="cpm_input.tsv" ftype="txt"/>
+            <param name="labels" value="yes" />
+            <param name="visu_choice" value="HCPC" />
+            <param name="coord" value="yes" />
+            <param name="HCPC_method" value="average"/>
+            <param name="HCPC_metric" value="manhattan"/>
+            <param name="HCPC_npc" value="4" />
+            <output name="pdf_out" file="hcpc-2.labels.pdf" ftype="pdf"/>
+            <output name="table_coordinates" file="hcpc-2.coord.tab" ftype="tabular"/>
+        </test>
+        <test>
+            <param name="input" value="cpm_input.tsv" ftype="txt"/>
+            <param name="labels" value="yes" />
+            <param name="visu_choice" value="HCPC" />
+            <param name="coord" value="yes" />
+            <param name="HCPC_method" value="single"/>
+            <param name="HCPC_metric" value="euclidian"/>
+            <param name="HCPC_npc" value="4" />
+            <param name="HCPC_clusterCA" value="cols" />
+            <output name="pdf_out" file="hcpc-3.labels.pdf" ftype="pdf"/>
+            <output name="table_coordinates" file="hcpc-3.coord.tab" ftype="tabular"/>
+        </test>
+        <!-- test t-SNE -->
+        <test>
+            <param name="input" value="cpm_input.tsv" ftype="txt"/>
+            <param name="labels" value="yes" />
+            <param name="visu_choice" value="tSNE" />
+            <param name="Rtsne_seed" value="49"/>
+            <param name="Rtsne_perplexity" value="10"/>
+            <param name="Rtsne_theta" value="1" />
+            <output name="pdf_out" file="tsne.labels.pdf" ftype="pdf"/>
+        </test>
+        <test>
+            <param name="input" value="cpm_input.tsv" ftype="txt"/>
+            <param name="labels" value="no" />
+            <param name="visu_choice" value="tSNE" />
+            <param name="Rtsne_seed" value="49"/>
+            <param name="Rtsne_perplexity" value="10"/>
+            <param name="Rtsne_theta" value="1" />
+            <output name="pdf_out" file="tsne.nolabels.pdf" ftype="pdf"/>
+        </test>
+        <test>
+            <param name="input" value="cpm_input.tsv" ftype="txt"/>
+            <param name="labels" value="no" />
+            <param name="visu_choice" value="tSNE" />
+            <param name="coord" value="yes" />
+            <param name="Rtsne_seed" value="42"/>
+            <param name="Rtsne_perplexity" value="5.0"/>
+            <param name="Rtsne_theta" value="1.0" />
+            <param name="Rtsne_dims" value="3" />
+            <param name="Rtsne_exaggeration_factor" value="15.0" />
+            <output name="pdf_out" file="tsne-2.nolabels.pdf" ftype="pdf"/>
+            <output name="table_coordinates" file="tsne-2.coord.tab" ftype="tabular"/>
+        </test>
+        <!-- test factor contrasting on t-SNE -->
+        <test>
+            <param name="input" value="cpm_input.tsv" ftype="txt"/>
+            <param name="labels" value="yes" />
+            <param name="visu_choice" value="tSNE" />
+            <param name="factor_choice" value="Yes" />
+            <param name="factor" value="shuffled_factor.tsv" ftype="tabular"/>
+            <param name="Rtsne_seed" value="49"/>
+            <param name="Rtsne_perplexity" value="10"/>
+            <param name="Rtsne_theta" value="1" />
+            <output name="pdf_out" file="tsne.labels.factor.pdf" ftype="pdf"/>
+        </test>
+    </tests>
+    <help>
+
+**What it does**
+
+Takes as input a matrix of n observations (columns, generally n RNAseq libraries) of k variables
+(rows, generally k genes).
+
+k variables define a space of k dimensions. Any observation
+of k expression values for k genes (the purpose of one RNAseq experiment) can be assigned
+to a position in the k-dim space, of coordinates c1, c2, c3, ..., ck.
+
+Since visualisation in more than 3 dimensions is not easy for a human being, there are
+a number of methods to "reduce" or "project" a k-dim space into a space of 2 or 3 dimensions.
+This is of great help, not only to summarise the data, but also to find similarities, common trends
+between the data (under the hypothesis that similar data are closer in the k-dimension space).
+
+This tool returns the visualisation of a dimensional reduction using either:
+
+* Principal Components Analysis (PCA)
+* Hierarchical Clustering of Principal Components (HCPC)
+* t-distributed Stochastic Neighbor Embedding (t-SNE)
+
+The tool can additionally return the table of the coordinates of the observations (eg RNAseq libraries)
+in the low dim space, which can be used for post-treatment or to further adjust the provided visualisation.
+
+**Contrast data with a factor**
+
+The tool offers the possibility to colour data points according to the levels of a factor.
+To use the option "Factor to contrast data", provide a tab-separated, two-column table
+with first column containing the cell/data library identifiers (same identifiers as those
+provided as column headers in the input data table) and second column containing the corresponding
+factor levels value. This table does not need to be sorted in the same order as in the data
+table. It may also contain more identifiers than those provided in the data table.
+
+    </help>
+    <!-- BibTeX references for the t-SNE method and the Rtsne package.
+         NOTE(review): the Rtsne entry cites package version 0.15 while the
+         requirements section pins r-rtsne 0.13, and no reference is listed for the
+         packages behind PCA/HCPC (r-factominer, r-factoextra); confirm whether
+         this is intended. -->
+    <citations>
+        <citation type="bibtex">@Article{,
+            title = {Visualizing High-Dimensional Data Using t-SNE},
+            volume = {9},
+            pages = {2579-2605},
+            year = {2008},
+            author = {L.J.P. {van der Maaten} and G.E. Hinton},
+            journal = {Journal of Machine Learning Research},
+            }
+        </citation>
+        <citation type="bibtex">@Article{,
+            title = {Accelerating t-SNE using Tree-Based Algorithms},
+            volume = {15},
+            pages = {3221-3245},
+            year = {2014},
+            author = {L.J.P. {van der Maaten}},
+            journal = {Journal of Machine Learning Research},
+            }
+        </citation>
+        <citation type="bibtex">@Manual{,
+            title = {{Rtsne}: T-Distributed Stochastic Neighbor Embedding using
+            Barnes-Hut Implementation},
+            author = {Jesse H. Krijthe},
+            year = {2015},
+            note = {R package version 0.15},
+            url = {https://github.com/jkrijthe/Rtsne},
+            }
+        </citation>
+  </citations>
+</tool>