comparison high_dim_visu.xml @ 0:cad0001b9cfb draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/gsc_high_dimension_visualization commit 09dcd74dbc01f448518cf3db3e646afb0675a6fe
author artbio
date Mon, 24 Jun 2019 13:39:11 -0400
parents
children c756ab726a85
comparison
equal deleted inserted replaced
-1:000000000000 0:cad0001b9cfb
1 <tool id="high_dimensions_visualisation" name="Generate PCA, tSNE and HCPC" version="0.9.0">
2 <description>from highly dimensional expression data</description>
<!-- Package dependencies declared for this tool. Every version carries an "=r3.3.2_0"
     suffix (presumably a build string tying the package to R 3.3.2; confirm). -->
3 <requirements>
4 <requirement type="package" version="1.3.2=r3.3.2_0">r-optparse</requirement>
5 <requirement type="package" version="1.39=r3.3.2_0">r-factominer</requirement>
6 <requirement type="package" version="1.0.5=r3.3.2_0">r-factoextra</requirement>
7 <requirement type="package" version="0.13=r3.3.2_0">r-rtsne</requirement>
8 <requirement type="package" version="2.2.1=r3.3.2_0">r-ggplot2</requirement>
9 <requirement type="package" version="0.4.1=r3.3.2_0">r-ggfortify</requirement>
10 </requirements>
<!-- Any exit code of 1 or above is reported as a fatal error labelled "Tool exception". -->
11 <stdio>
12 <exit_code range="1:" level="fatal" description="Tool exception" />
13 </stdio>
<!--
  Builds the Rscript command line. The shared options (data, sep, colnames, labels,
  visu_choice, pdf_out) are always passed; factor and table_coordinates are added only
  when requested, and only the parameter group of the selected visualisation method
  (tSNE, HCPC or PCA) is emitted.
  The script path is quoted so that a tool directory containing spaces or shell
  metacharacters is still passed to Rscript as a single argument.
-->
14 <command detect_errors="exit_code"><![CDATA[
15 Rscript '$__tool_directory__/high_dim_visu.R'
16 --data '$input'
17 --sep '$input_sep'
18 --colnames '$input_header'
19 #if $factor_condition.factor_choice == 'Yes':
20 --factor '$factor_condition.factor'
21 #end if
22 #if $labels == "yes":
23 --labels 'TRUE'
24 #else
25 --labels 'FALSE'
26 #end if
27 #if $coord == "yes":
28 --table_coordinates '$table_coordinates'
29 #end if
30 --visu_choice '$visualisation.visu_choice'
31 #if $visualisation.visu_choice == "tSNE":
32 --Rtsne_seed '$visualisation.Rtsne_seed'
33 --Rtsne_perplexity '$visualisation.Rtsne_perplexity'
34 --Rtsne_theta '$visualisation.Rtsne_theta'
35 --Rtsne_max_iter '$visualisation.Rtsne_max_iter'
36 --Rtsne_dims '$visualisation.Rtsne_dims'
37 --Rtsne_initial_dims '$visualisation.Rtsne_initial_dims'
38 --Rtsne_pca '$visualisation.Rtsne_pca'
39 --Rtsne_pca_center '$visualisation.Rtsne_pca_center'
40 --Rtsne_pca_scale '$visualisation.Rtsne_pca_scale'
41 --Rtsne_normalize '$visualisation.Rtsne_normalize'
42 --Rtsne_exaggeration_factor '$visualisation.Rtsne_exaggeration_factor'
43 #end if
44
45 #if $visualisation.visu_choice == "HCPC":
46 --HCPC_ncluster '$visualisation.HCPC_ncluster'
47 --HCPC_npc '$visualisation.HCPC_npc'
48 --HCPC_metric '$visualisation.HCPC_metric'
49 --HCPC_method '$visualisation.HCPC_method'
50 --HCPC_consol '$visualisation.HCPC_consol'
51 --HCPC_itermax '$visualisation.HCPC_itermax'
52 --HCPC_min '$visualisation.HCPC_min'
53 --HCPC_max '$visualisation.HCPC_max'
54 --HCPC_clusterCA '$visualisation.HCPC_clusterCA'
55 --HCPC_kk '$visualisation.HCPC_kk'
56 #end if
57
58 #if $visualisation.visu_choice == "PCA":
59 --PCA_npc '$visualisation.PCA_npc'
60 #end if
61
62
63 --pdf_out '$pdf_out'
64
65 ]]></command>
66 <inputs>
<!--
  General inputs, all forwarded by the command section:
  input        : expression table (txt or tabular), passed as the data option.
  input_sep    : column separator, the literal "tab" or a comma.
  input_header : TRUE or FALSE, passed as the colnames option.
  labels       : yes or no; the command section translates it to TRUE or FALSE.
-->
67 <param name="input" type="data" format="txt,tabular" label="expression data"/>
68 <param name="input_sep" type="select" label="Input column separator">
69 <option value="tab" selected="true">Tabs</option>
70 <option value=",">Comma</option>
71 </param>
72 <param name="input_header" type="select" label="Consider first line of input file as header?">
73 <option value="TRUE" selected="true">Yes</option>
74 <option value="FALSE">No</option>
75 </param>
76 <param name="labels" type="select" label="Add sample labels to scatter plot" >
77 <option value="no" selected="true">No Labels</option>
78 <option value="yes" >Label points</option>
79 </param>
<!-- Optional factor table: when factor_choice is Yes, a two-column tabular dataset is
     requested and the command section passes it as the factor option. -->
80 <conditional name="factor_condition">
81 <param label="Do you wish to contrast cells with a factor" name="factor_choice" type="select">
82 <option value="Yes">Yes</option>
83 <option value="No" selected="true">No</option>
84 </param>
85 <when value="Yes">
86 <param name="factor" type="data" format="tabular" label="Factor to contrast data"
87 help="A two-column data frame, first column contains data labels, second column contains the levels of a factor to contrast visualisation" />
88 </when>
89 <when value="No">
90 </when>
91 </conditional>
<!--
  Visualisation method selector. Each "when" branch declares the parameters that the
  command section forwards for that method only.
  Boolean selects are displayed as Yes/No and submit the strings TRUE/FALSE.
  NOTE(review): the HCPC metric value is spelled "euclidian"; it is left unchanged because
  the R script consuming it is not visible here. Confirm it is the spelling the script expects.
-->
92 <conditional name="visualisation">
93 <param label="Choose visualisation method" name="visu_choice" type="select">
94 <option value="PCA" selected="true">PCA</option>
95 <option value="HCPC">HCPC</option>
96 <option value="tSNE">t-SNE</option>
97 </param>
98 <when value="tSNE">
99 <param name="Rtsne_seed" value="42" type="integer" label="Seed value for reproducibility of t-SNE" help="Set to 42 as default" />
100 <param name="Rtsne_dims" value="2" type="integer" label="dims (t-SNE)" help="Output dimensionality (should not be greater than 3)" />
101 <param name="Rtsne_pca" type="select" label="pca (t-SNE)" help="Whether an initial PCA step should be performed" >
102 <option value="TRUE" selected="true">Yes</option>
103 <option value="FALSE">No</option>
104 </param>
105 <param name="Rtsne_initial_dims" value="50" type="integer" label="initial dims (t-SNE)" help="The number of dimensions that should be retained in the initial PCA step" />
106 <param name="Rtsne_pca_center" type="select" label="Centering data" help="Should data be centered before pca is applied? " >
107 <option value="TRUE" selected="true">Yes</option>
108 <option value="FALSE">No</option>
109 </param>
110 <param name="Rtsne_pca_scale" type="select" label="Scaling data" help="Should data be scaled before pca is applied? " >
111 <option value="TRUE">Yes</option>
112 <option value="FALSE" selected="true">No</option>
113 </param>
114 <param name="Rtsne_normalize" type="select" label="Normalisation of data"
115 help="Should variables (gene expressions) be normalized internally prior to distance calculations? " >
116 <option value="TRUE" selected="true">Yes</option>
117 <option value="FALSE">No</option>
118 </param>
119 <param name="Rtsne_perplexity" value="10.0" type="float" label="perplexity (t-SNE)" help="should be less than ((nbr observations)-1)/3" />
120 <param name="Rtsne_theta" value="1.0" type="float" label="theta (t-SNE)"/>
121 <param name="Rtsne_exaggeration_factor" value="12.0" type="float" label="Exaggeration factor" help="Exaggeration factor used to multiply the P matrix in the first part of the optimization" />
122 <param name="Rtsne_max_iter" value="1000" type="integer" label="Number of iterations (default: 1000)"
123 help="The number of iterations that Rtsne executes to improve low dim representation (gradient descent optimization)" />
124 </when>
125 <when value="HCPC">
126 <param name="HCPC_npc" value="5" type="integer" label="Number of principal components to keep"
127 help="The number of dimensions which are kept for HCPC analysis (default=5)" />
128 <param name="HCPC_ncluster" value="-1" type="integer" label="Number of clusters in Hierar. Clustering"
129 help="nb.clust, the number of clusters to consider in the hierarchical clustering. (default : -1, let HCPC optimize the number)" />
130 <param name="HCPC_metric" type="select" label="Dissimilarity metric" help="Metric to be used for calculating dissimilarities between observations, available 'euclidian' or 'manhattan'? " >
131 <option value="euclidian" selected="true">euclidian</option>
132 <option value="manhattan">manhattan</option>
133 </param>
134 <param name="HCPC_method" type="select" label="Clustering method" help="Clustering method between 'ward', 'average', 'single', 'complete', 'weighted' " >
135 <option value="ward" selected="true">ward</option>
136 <option value="average">average</option>
137 <option value="single">single</option>
138 <option value="complete">complete</option>
139 <option value="weighted">weighted</option>
140 </param>
141 <param name="HCPC_consol" type="select" label="k-means consolidation" help="If TRUE, a k-means consolidation is performed" >
142 <option value="TRUE" selected="true">Yes</option>
143 <option value="FALSE">No</option>
144 </param>
145 <param name="HCPC_itermax" value="10" type="integer" label="Maximum number of iterations for consolidation"
146 help=" (default=10)" />
147 <param name="HCPC_min" value="3" type="integer" label="min number of clusters"
148 help=" The least possible number of clusters suggested (default=3)" />
149 <param name="HCPC_max" value="-1" type="text" label="max number of clusters"
150 help=" The highest possible number of clusters suggested, by default the minimum between 10 and the number of individuals divided by 2. (default=-1)" />
151 <param name="HCPC_clusterCA" type="select" label="clusterCA, Clustering against rows or columns" help="default(rows)" >
152 <option value="rows" selected="true">Rows</option>
153 <option value="cols">Columns</option>
154 </param>
155 <param name="HCPC_kk" value="-1" type="text" label="kk, Number of clusters used in a Kmeans preprocessing "
156 help="No k-means consolidation is done if a kk value is provided (default=-1)" />
157 </when>
158 <when value="PCA">
159 <param name="PCA_npc" value="5" type="integer" label="Number of principal components to keep" help="The number of dimensions which are kept for PCA analysis (default=5)" />
160 </when>
161 </conditional>
<!-- When set to yes, the command section passes the table_coordinates option and the
     table_coordinates output (filtered on this parameter) is produced. -->
162 <param label="Return scatter plot table coordinates" name="coord" type="select">
163 <option value="no" selected="true">No</option>
164 <option value="yes">Yes</option>
165 </param>
166
167 </inputs>
<!-- pdf_out is always produced and is labelled with the chosen visualisation method;
     table_coordinates is only created when the coord parameter equals 'yes'. -->
168 <outputs>
169 <data name="pdf_out" format="pdf" label="${visualisation.visu_choice} on ${on_string}" />
170 <data name="table_coordinates" format="tabular" label="Scatter plot coordinates from ${tool.name} on ${on_string}" >
171 <filter>coord == 'yes'</filter>
172 </data>
173 </outputs>
174 <tests>
175 <!-- test PCA -->
<!-- PCA with point labels, no factor -->
176 <test>
177 <param name="input" value="cpm_input.tsv" ftype="txt"/>
178 <param name="labels" value="yes" />
179 <param name="visu_choice" value="PCA" />
180 <param name="factor_choice" value="No" />
181 <output name="pdf_out" file="pca.labels.pdf" ftype="pdf"/>
182 </test>
<!-- PCA without point labels, no factor -->
183 <test>
184 <param name="input" value="cpm_input.tsv" ftype="txt"/>
185 <param name="labels" value="no" />
186 <param name="visu_choice" value="PCA" />
187 <param name="factor_choice" value="No" />
188 <output name="pdf_out" file="pca.nolabels.pdf" ftype="pdf"/>
189 </test>
190 <!-- test Coordinates tables on PCA -->
191 <test>
192 <param name="input" value="cpm_input.tsv" ftype="txt"/>
193 <param name="labels" value="no" />
194 <param name="visu_choice" value="PCA" />
195 <param name="coord" value="yes" />
196 <param name="factor_choice" value="No" />
197 <output name="pdf_out" file="pca.nolabels.pdf" ftype="pdf"/>
198 <output name="table_coordinates" file="pca.coord.tab" ftype="tabular"/>
199 </test>
200 <!-- test factor contrasting on PCA -->
201 <test>
202 <param name="input" value="cpm_input.tsv" ftype="txt"/>
203 <param name="labels" value="no" />
204 <param name="visu_choice" value="PCA" />
205 <param name="factor_choice" value="Yes" />
206 <param name="factor" value="factor.tsv" ftype="txt"/>
207 <output name="pdf_out" file="pca.nolabels.factors.pdf" ftype="pdf"/>
208 </test>
<!-- Same expected plot as the previous test but with shuffled_factor.tsv; presumably
     checks that the row order of the factor table does not matter (confirm). -->
209 <test>
210 <param name="input" value="cpm_input.tsv" ftype="txt"/>
211 <param name="labels" value="no" />
212 <param name="visu_choice" value="PCA" />
213 <param name="factor_choice" value="Yes" />
214 <param name="factor" value="shuffled_factor.tsv" ftype="txt"/>
215 <output name="pdf_out" file="pca.nolabels.factors.pdf" ftype="pdf"/>
216 </test>
217 <!-- test HCPC -->
<!-- HCPC with point labels, 5 components, automatic number of clusters (value -1) -->
218 <test>
219 <param name="input" value="cpm_input.tsv" ftype="txt"/>
220 <param name="labels" value="yes" />
221 <param name="visu_choice" value="HCPC" />
222 <param name="HCPC_npc" value="5"/>
223 <param name="HCPC_ncluster" value="-1"/>
224 <output name="pdf_out" file="hcpc.labels.pdf" ftype="pdf"/>
225 </test>
226 <!-- test factor contrasting on HCPC -->
227 <test>
228 <param name="input" value="cpm_input.tsv" ftype="txt"/>
229 <param name="labels" value="no" />
230 <param name="visu_choice" value="HCPC" />
231 <param name="HCPC_npc" value="5"/>
232 <param name="HCPC_ncluster" value="-1"/>
233 <param name="factor_choice" value="Yes" />
234 <param name="factor" value="factor.tsv" ftype="txt"/>
235 <output name="pdf_out" file="hcpc.nolabels.factor.pdf" ftype="pdf"/>
236 </test>
<!-- HCPC without point labels and without a factor -->
237 <test>
238 <param name="input" value="cpm_input.tsv" ftype="txt"/>
239 <param name="labels" value="no" />
240 <param name="HCPC_npc" value="5"/>
241 <param name="HCPC_ncluster" value="-1"/>
242 <param name="visu_choice" value="HCPC" />
243 <output name="pdf_out" file="hcpc.nolabels.pdf" ftype="pdf"/>
244 </test>
<!-- HCPC with the average method, manhattan metric and 4 components; also checks the
     coordinates table -->
245 <test>
246 <param name="input" value="cpm_input.tsv" ftype="txt"/>
247 <param name="labels" value="yes" />
248 <param name="visu_choice" value="HCPC" />
249 <param name="coord" value="yes" />
250 <param name="HCPC_method" value="average"/>
251 <param name="HCPC_metric" value="manhattan"/>
252 <param name="HCPC_npc" value="4" />
253 <output name="pdf_out" file="hcpc-2.labels.pdf" ftype="pdf"/>
254 <output name="table_coordinates" file="hcpc-2.coord.tab" ftype="tabular"/>
255 </test>
<!-- HCPC with the single method, euclidian metric, 4 components and clusterCA set to
     cols; also checks the coordinates table -->
256 <test>
257 <param name="input" value="cpm_input.tsv" ftype="txt"/>
258 <param name="labels" value="yes" />
259 <param name="visu_choice" value="HCPC" />
260 <param name="coord" value="yes" />
261 <param name="HCPC_method" value="single"/>
262 <param name="HCPC_metric" value="euclidian"/>
263 <param name="HCPC_npc" value="4" />
264 <param name="HCPC_clusterCA" value="cols" />
265 <output name="pdf_out" file="hcpc-3.labels.pdf" ftype="pdf"/>
266 <output name="table_coordinates" file="hcpc-3.coord.tab" ftype="tabular"/>
267 </test>
268 <!-- test t-SNE -->
<!-- t-SNE with point labels, seed 49, perplexity 10, theta 1 -->
269 <test>
270 <param name="input" value="cpm_input.tsv" ftype="txt"/>
271 <param name="labels" value="yes" />
272 <param name="visu_choice" value="tSNE" />
273 <param name="Rtsne_seed" value="49"/>
274 <param name="Rtsne_perplexity" value="10"/>
275 <param name="Rtsne_theta" value="1" />
276 <output name="pdf_out" file="tsne.labels.pdf" ftype="pdf"/>
277 </test>
<!-- Same t-SNE settings without point labels -->
278 <test>
279 <param name="input" value="cpm_input.tsv" ftype="txt"/>
280 <param name="labels" value="no" />
281 <param name="visu_choice" value="tSNE" />
282 <param name="Rtsne_seed" value="49"/>
283 <param name="Rtsne_perplexity" value="10"/>
284 <param name="Rtsne_theta" value="1" />
285 <output name="pdf_out" file="tsne.nolabels.pdf" ftype="pdf"/>
286 </test>
<!-- t-SNE with 3 output dimensions, seed 42, perplexity 5 and exaggeration factor 15;
     also checks the coordinates table -->
287 <test>
288 <param name="input" value="cpm_input.tsv" ftype="txt"/>
289 <param name="labels" value="no" />
290 <param name="visu_choice" value="tSNE" />
291 <param name="coord" value="yes" />
292 <param name="Rtsne_seed" value="42"/>
293 <param name="Rtsne_perplexity" value="5.0"/>
294 <param name="Rtsne_theta" value="1.0" />
295 <param name="Rtsne_dims" value="3" />
296 <param name="Rtsne_exaggeration_factor" value="15.0" />
297 <output name="pdf_out" file="tsne-2.nolabels.pdf" ftype="pdf"/>
298 <output name="table_coordinates" file="tsne-2.coord.tab" ftype="tabular"/>
299 </test>
300 <!-- test factor contrasting on t-SNE -->
301 <test>
302 <param name="input" value="cpm_input.tsv" ftype="txt"/>
303 <param name="labels" value="yes" />
304 <param name="visu_choice" value="tSNE" />
305 <param name="factor_choice" value="Yes" />
306 <param name="factor" value="shuffled_factor.tsv" ftype="txt"/>
307 <param name="Rtsne_seed" value="49"/>
308 <param name="Rtsne_perplexity" value="10"/>
309 <param name="Rtsne_theta" value="1" />
310 <output name="pdf_out" file="tsne.labels.factor.pdf" ftype="pdf"/>
311 </test>
312 </tests>
313 <help>
314
315 **What it does**
316
317 Takes as input a matrix of n observations (columns, generally n RNAseq libraries) of k variables
318 (rows, generally k genes).
319
320 k variables define a space of k dimensions. Any observation
321 of k expression values for k genes (the purpose of one RNAseq experiment) can be assigned
322 to a position in the k-dim space, of coordinates c1, c2, c3, ..., ck.
323
324 Since visualisation in more than 3 dimensions is not easy for a human being, there are
325 a number of methods to "reduce" or "project" a k-dim space in a space of 2 or 3 dimensions.
326 This is of great help, not only to summarise the data, but also to find similarities, common trends
327 between the data (under the hypothesis that similar data are closer in the k-dimension space).
328
329 This tool returns the visualisation of a dimensional reduction using either:
330
331 * Principal Components Analysis (PCA)
332 * Hierarchical Clustering of Principal Components (HCPC)
333 * t-distributed Stochastic Neighbor Embedding (t-SNE)
334
335 The tool returns in addition the table of the coordinates of the observations (eg RNAseq libraries)
336 in the low dim space, which can be used for post-treatment or to further adjust the provided visualisation.
337
338 **Contrast data with a factor**
339 The tool offers the possibility to colour data points according to the levels of a factor.
340 To use the option "Factor to contrast data", provide a tab-separated, two-column table
341 with first column containing the cell/data library identifiers (same identifiers as those
342 provided as column headers in the input data table) and second column containing the corresponding
343 factor levels value. This table does not need to be sorted in the same order as in the data
344 table. It may also contain more identifiers than those provided in the data table.
345
346 </help>
<!-- BibTeX references for the t-SNE method and the Rtsne package.
     NOTE(review): the Rtsne entry cites package version 0.15 while the requirements
     section pins r-rtsne 0.13; confirm which one is intended. -->
347 <citations>
348 <citation type="bibtex">@Article{,
349 title = {Visualizing High-Dimensional Data Using t-SNE},
350 volume = {9},
351 pages = {2579-2605},
352 year = {2008},
353 author = {L.J.P. {van der Maaten} and G.E. Hinton},
354 journal = {Journal of Machine Learning Research},
355 }
356 </citation>
357 <citation type="bibtex">@Article{,
358 title = {Accelerating t-SNE using Tree-Based Algorithms},
359 volume = {15},
360 pages = {3221-3245},
361 year = {2014},
362 author = {L.J.P. {van der Maaten}},
363 journal = {Journal of Machine Learning Research},
364 }
365 </citation>
366 <citation type="bibtex">@Manual{,
367 title = {{Rtsne}: T-Distributed Stochastic Neighbor Embedding using
368 Barnes-Hut Implementation},
369 author = {Jesse H. Krijthe},
370 year = {2015},
371 note = {R package version 0.15},
372 url = {https://github.com/jkrijthe/Rtsne},
373 }
374 </citation>
375 </citations>
376 </tool>