comparison biosigner_config.xml @ 0:48e4be935243 draft

planemo upload for repository https://github.com/workflow4metabolomics/biosigner.git commit b8af709c9fd6ed283fc4e4249dcf692556927b2d
author ethevenot
date Wed, 27 Jul 2016 11:40:20 -0400
parents
children 4ff502a46189
comparison
equal deleted inserted replaced
-1:000000000000 0:48e4be935243
1 <tool id="biosigner" name="Biosigner" version="2.2.2">
2 <description>Molecular signature discovery from omics data</description>
3
4 <requirements>
5 <requirement type="package" version="3.2.2">R</requirement>
6 <requirement type="package">r-batch</requirement>
7 <requirement type="package">bioconductor-biosigner</requirement>
8 </requirements>
9
10 <command><![CDATA[
11 $__tool_directory__/biosigner_wrapper.R
12
13 dataMatrix_in "$dataMatrix_in"
14 sampleMetadata_in "$sampleMetadata_in"
15 variableMetadata_in "$variableMetadata_in"
16
17 respC "$respC"
18
19 #if $advCpt.opcC == "full"
20 methodC "$advCpt.methodC"
21 bootI "$advCpt.bootI"
22 tierC "$advCpt.tierC"
23 pvalN "$advCpt.pvalN"
24 seedI "$advCpt.seedI"
25 #end if
26
27 variableMetadata_out "$variableMetadata_out"
28 figure_tier "$figure_tier"
29 figure_boxplot "$figure_boxplot"
30 information "$information"
31 ]]></command>
32
33 <inputs>
34 <param name="dataMatrix_in" label="Data matrix file" type="data" format="tabular" help="variable x sample, decimal: '.', missing: NA, mode: numerical, sep: tabular" />
35 <param name="sampleMetadata_in" label="Sample metadata file" type="data" format="tabular" help="sample x metadata, decimal: '.', missing: NA, mode: character and numerical, sep: tabular" />
36 <param name="variableMetadata_in" label="Variable metadata file" type="data" format="tabular" help="variable x metadata, decimal: '.', missing: NA, mode: character and numerical, sep: tabular" />
37 <param name="respC" label="Sample classes" type="text" value = "" help="Column of sampleMetadata containing 2 types of strings (e.g., 'case' and 'control')" />
38
39 <conditional name="advCpt">
40 <param name="opcC" type="select" label="Advanced computational parameters" >
41 <option value="default" selected="true">Use default</option>
42 <option value="full">Full parameter list</option>
43 </param>
44 <when value="default"/>
45 <when value="full">
46 <param name="methodC" label="Classification method(s)" type="select" help="">
47 <option value="all" selected="true">all</option>
48 <option value="plsda">PLS-DA</option>
49 <option value="randomforest">Random Forest</option>
50 <option value="svm">SVM</option>
51 </param>
52 <param name="bootI" type="integer" value="50" label="Number of bootstraps" help=""/>
53 <param name="tierC" label="Selection tier(s)" type="select" help="">
54 <option value="S" selected="true">S</option>
55 <option value="A">S+A</option>
56 </param>
57 <param name="pvalN" type="float" value="0.05" label="p-value threshold" help="Must be between 0 and 1"/>
58 <param name="seedI" type="integer" value="0" label="Seed" help="Select an integer (e.g., 123) if you want to obtain exactly the same signatures when re-running the algorithm; 0 means that no seed is selected"/>
59 </when>
60 </conditional>
61
62 </inputs>
63
64 <outputs>
65 <data name="variableMetadata_out" label="${tool.name}_${variableMetadata_in.name}" format="tabular" ></data>
66 <data name="figure_tier" label="${tool.name}__figure-tier.pdf" format="pdf"/>
67 <data name="figure_boxplot" label="${tool.name}__figure-boxplot.pdf" format="pdf"/>
68 <data name="information" label="${tool.name}__information.txt" format="txt"/>
69 </outputs>
70
71 <tests>
72 <test>
73 <param name="dataMatrix_in" value="dataMatrix.tsv"/>
74 <param name="sampleMetadata_in" value="sampleMetadata.tsv"/>
75 <param name="variableMetadata_in" value="variableMetadata.tsv"/>
76 <param name="respC" value="gender"/>
77 <param name="opcC" value="full"/>
78 <param name="methodC" value="all"/>
79 <param name="bootI" value="5"/>
80 <param name="tierC" value="S"/>
81 <param name="pvalN" value="0.05"/>
82 <param name="seedI" value="123"/>
83 <output name="variableMetadata_out" file="variableMetadata.out"/>
84 </test>
85 </tests>
86
87 <help>
88
89 .. class:: infomark
90
91 **Author** Philippe Rinaudo and Etienne Thevenot (CEA, LIST, MetaboHUB Paris, etienne.thevenot@cea.fr)
92
93 ---------------------------------------------------
94
95 .. class:: infomark
96
97 **Please cite**
98
99 Philippe Rinaudo, Christophe Junot and Etienne A. Thevenot. *biosigner*: A new method for the discovery of restricted and stable molecular signatures from omics data. *submitted*.
100
101 ---------------------------------------------------
102
103 .. class:: infomark
104
105 **R package**
106
107 The *biosigner* package has been submitted to the bioconductor repository (http://bioconductor.org/packages/biosigner).
108
109 ---------------------------------------------------
110
111 .. class:: infomark
112
113 **Tool updates**
114
115 See the **NEWS** section at the bottom of this page
116
117 ---------------------------------------------------
118
119 ==========================================================
120 *biosigner*: Molecular signature discovery from omics data
121 ==========================================================
122
123 -----------
124 Description
125 -----------
126
127 High-throughput, non-targeted, technologies such as transcriptomics, proteomics and metabolomics, are widely used to **discover molecules** which allow efficient discrimination between biological or clinical conditions of interest (e.g., disease vs control states). Powerful **machine learning** approaches such as Partial Least Squares Discriminant Analysis (PLS-DA), Random Forest (RF) and Support Vector Machines (SVM) have been shown to achieve high levels of prediction accuracy.
128
129 **Feature selection**, i.e., the selection of the few features (i.e., the molecular signature) which are of highest discriminating value, is a critical step in building a robust and relevant classifier (Guyon and Elisseeff, 2003): First, dimension reduction is useful to limit the risk of overfitting and reduce the prediction variability of the model; second, interpretation of the molecular signature is facilitated; third, in the case of the development of a diagnostic product, a restricted list is required for the subsequent validation steps (Rifai et al, 2006).
130
131 Since the comprehensive analysis of all combinations of features is not computationally tractable, several selection techniques have been described (Saeys et al, 2007). The major challenge for such methods is to be fast and extract **restricted and stable molecular signatures** which still provide high performance of the classifier (Gromski et al, 2014; Determan, 2015).
132
133 The **biosigner** module implements a new feature selection algorithm to assess the relevance of the variables for the prediction performances of the classifier (Rinaudo et al, submitted). Three binary classifiers can be run in parallel, namely **PLS-DA**, **Random Forest** and **SVM**, as the performances of each machine learning approach may vary depending on the structure of the dataset. The algorithm computes the *tier* of each feature for the selected classifier(s): tier *S* corresponds to the final signature, i.e., features which have been found significant in all the selection steps; features with tier *A* have been found significant in all but the last selection, and so on for tier *B* to *E*. It returns the **signature** (by default from the *S* tier) for each of the selected classifiers as an additional column of the **variableMetadata** table. In addition, the *tiers* and **individual boxplots** of the selected features are returned.
134
135 The module has been successfully applied to **transcriptomics** and **metabolomics** data.
136
137 Note:
138 | 1) Only **binary** classification is currently available,
139 | 2) If the **dataMatrix** contains **missing** values (NA), these features will be removed prior to modeling with Random Forest and SVM (in contrast, the NIPALS algorithm from PLS-DA can handle missing values),
140 | 3) As the algorithm relies on bootstrapping, re-running the module may result in slightly different results. To ensure that returned results are exactly the same, the **seed** (advanced) parameter can be used.
141 |
142
143
144 ---------------------------------------------------
145
146 .. class:: infomark
147
148 **References**
149
150 | Determan C. (2015). Optimal algorithm for metabolomics classification and feature selection varies by dataset. *International Journal of Biology* 7, 100-115.
151 | Gromski P.S., Xu Y., Correa E., Ellis D.I., Turner M.L. and Goodacre R. (2014). A comparative investigation of modern feature selection and classification approaches for the analysis of mass spectrometry data. *Analytica Chimica Acta* 829, 1-8.
152 | Guyon I. and Elisseeff A. (2003). An introduction to variable and feature selection. *Journal of Machine Learning Research* 3, 1157-1182.
153 | Rifai N., Gillette M.A. and Carr S.A. (2006). Protein biomarker discovery and validation: the long and uncertain path to clinical utility. *Nature Biotechnology* 24, 971-983.
154 | Rinaudo P., Junot C. and Thevenot E.A. *biosigner*: A new method for the discovery of restricted and stable molecular signatures from omics data. *submitted*.
155 | Saeys Y., Inza I. and Larranaga P. (2007). A review of feature selection techniques in bioinformatics. *Bioinformatics* 23, 2507-2517.
156
157 ---------------------------------------------------
158
159 -----------------
160 Workflow position
161 -----------------
162
163 .. image:: biosigner_workflowPositionImage.png
164 :width: 600
165
166 -----------
167 Input files
168 -----------
169
170 +---------------------------+------------+
171 | File | Format |
172 +===========================+============+
173 | 1) Data matrix | tabular |
174 +---------------------------+------------+
175 | 2) Sample metadata | tabular |
176 +---------------------------+------------+
177 | 3) Variable metadata | tabular |
178 +---------------------------+------------+
179
180
181 ----------
182 Parameters
183 ----------
184
185 Data matrix file
186 | variable x sample **dataMatrix** tabular separated file of the numeric intensities, with . as decimal, and NA for missing values; use the **Check Format** tool in the **LC-MS/Quality Control** section to check the formats of your **dataMatrix**, **sampleMetadata** and **variableMetadata** files
187 |
188
189 Sample metadata file
190 | sample x metadata **sampleMetadata** tabular separated file of the numeric and/or character sample metadata, with . as decimal and NA for missing values; use the **Check Format** tool in the **LC-MS/Quality Control** section to check the formats of your **dataMatrix**, **sampleMetadata** and **variableMetadata** files
191 |
192
193 Variable metadata file
194 | variable x metadata **variableMetadata** tabular separated file of the numeric and/or character variable metadata, with . as decimal and NA for missing values; use the **Check Format** tool in the **LC-MS/Quality Control** section to check the formats of your **dataMatrix**, **sampleMetadata** and **variableMetadata** files
195 |
196
197 Classes of samples
198 | Column of the sample metadata table to be used as the qualitative **binary** response to be modelled; the column should contain only two types of strings (e.g., 'case' and 'control')
199 |
200
201 Advanced: Classification method(s) (default = all)
202 | Either one or all of the following classifiers: Partial Least Squares Discriminant Analysis (PLS-DA), or Random Forest, or Support Vector Machine (SVM)
203 |
204
205 Advanced: Number of bootstraps (default = 50)
206 | This parameter controls the number of times the model performance is compared to the prediction on a test subset where the intensities of the candidate feature have been randomly permuted.
207 |
208
209 Advanced: Selection tier(s) (default = S)
210 | Tier *S* corresponds to the final signature, i.e., features which have been found significant in all the backward selection steps; features with tier *A* have been found significant in all but the last selection, and so on for tier *B* to *E*. Default selection tier is *S*, meaning that the final signature only is returned; to view a larger number of candidate features, the *S+A* tiers can be selected.
211 |
212
213 Advanced: p-value threshold (default = 0.05)
214 | This threshold controls the selection of the features at each selection round (tier): to be selected, the proportion of times the prediction on the test set with the randomized intensities of the feature is more accurate than on the original test set must be lower than this threshold. For example, if the number of bootstraps is 50, no more than 2 out of the 50 predictions on the randomized test set may be more accurate than on the original test set (since each prediction accounts for 1/50 = 0.02, and 2/50 = 0.04 is below 0.05).
215
216 Advanced: Seed (default = 0)
217 | As the algorithm relies on resampling (bootstrap), re-running the module may result in slightly different signatures. To ensure that returned results are exactly the same, the **seed** parameter (integer) can be used; the default, 0, means that no seed is used.
218 |
219
220 ------------
221 Output files
222 ------------
223
224 variableMetadata_out.tabular
225 | When at least one feature has been selected, a **tier** column is added indicating for each feature the classifier(s) it was selected from.
226 |
227
228 figure-tier.pdf
229 | Graphic summarizing which features were selected, with their corresponding tier (i.e., round(s) of selection) for each classifier.
230 |
231
232 figure-boxplot.pdf
233 | Individual boxplots of the features which were selected in at least one of the signatures. Features selected for a single classifier are colored (*red* for PLS-DA, *green* for Random Forest, and *blue* for SVM)
234 |
235
236 information.txt
237 | Text file with all messages and warnings generated during the computation.
238 |
239
240 ---------------------------------------------------
241
242 ---------------
243 Working example
244 ---------------
245
246 See the **W4M00003_diaplasma** in the **Shared Data/Published Histories** menu
247
248
249
250 Figure output
251 =============
252
253 .. image:: biosigner_workingExampleImage.png
254 :width: 600
255
256 ---------------------------------------------------
257
258 ----
259 NEWS
260 ----
261
262 CHANGES IN VERSION 2.2.2
263 ========================
264
265 Internal updates to biosigner package versions of 1.0.0 and above, and ropls versions of 1.4.0 and above (i.e. using S4 methods instead of S3)
266
267 CHANGES IN VERSION 2.2.1
268 ========================
269
270 Creation of the tool
271
272 </help>
273
274 <citations/>
275
276 </tool>