comparison svm_classifier.xml @ 1:2e7d47c0b027 draft

"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
author malex
date Mon, 08 Mar 2021 22:04:06 +0000
parents
children caba07f41453
comparison
equal deleted inserted replaced
0:b54326490b4d 1:2e7d47c0b027
1 <tool id="secimtools_svm_classifier" name="Support Vector Machine (SVM) Classifier" version="@WRAPPER_VERSION@">
2 <description>- Predict sample groups.</description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <expand macro="requirements" />
7 <!-- Command line for svm_classifier.py. Every substituted value is single-quoted so that an empty text field or a path containing whitespace still reaches the script as exactly one argument. --><command><![CDATA[
8 svm_classifier.py
9 --train_wide '$train_wide'
10 --train_design '$train_design'
11 --test_wide '$test_wide'
12 --test_design '$test_design'
13 --group '$group'
14 --ID '$uniqID'
15 --kernel '$kernel'
16 --degree '$degree'
17 --C '$C'
18 --cross_validation '$cross_validation'
19 --C_lower_bound '$C_lower_bound'
20 --C_upper_bound '$C_upper_bound'
21 --a '$a'
22 --b '$b'
23 --outClassification '$outClassification'
24 --outClassificationAccuracy '$outClassificationAccuracy'
25 --outPrediction '$outPrediction'
26 --outPredictionAccuracy '$outPredictionAccuracy'
27 ]]></command>
28 <inputs> <!-- User-facing parameters. Galaxy takes a select's default from an option marked selected="true", not from the param's value attribute, so the declared defaults (rbf, double) are marked on the options themselves. -->
29 <param name="train_wide" type="data" format="tabular" label="Training wide dataset" help="Dataset missing? See TIP below."/>
30 <param name="train_design" type="data" format="tabular" label="Training design file" help="Dataset missing? See TIP below."/>
31 <param name="test_wide" type="data" format="tabular" label="Target wide dataset" help="Dataset missing? See TIP below."/>
32 <param name="test_design" type="data" format="tabular" label="Target design file" help="Dataset missing? See TIP below."/>
33 <param name="group" size="30" type="text" value="" label="Group/Treatment" help="Name of the column in your Training and Target design files that contain group classifications."/>
34 <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your Training and Target wide datasets that contain unique identifiers."/>
35 <param name="kernel" type="select" size="30" display="radio" value="rbf" label="Select a SVM Kernel Function">
36 <option value="rbf" selected="true">Radial Basis function (Gaussian)</option>
37 <option value="linear">Linear</option>
38 <option value="poly">Polynomial</option>
39 <option value="sigmoid">Sigmoid</option>
40 </param>
41 <param name="degree" size="30" type="text" value="3" label="Polynomial Degree" help='Only used for the polynomial kernel.'/>
42 <param name="cross_validation" type="select" size="30" display="radio" value="double" label="Select Cross-Validation">
43 <option value="none">None</option>
44 <option value="single">Single</option>
45 <option value="double" selected="true">Double</option>
46 </param>
47 <param name="C" size="30" type="text" value="1" label="Regularization Parameter C" help='See references in tool description for setting this parameter. Value must be positive (C > 0). Used only if cross-validation is not selected. Default = 1.'/>
48 <param name="C_lower_bound" size="30" type="text" value="0.1" label="Regularization Parameter C (Lower Bound)" help='Defines the lower bound for regularization parameter C when cross-validation is used. Must have a positive value (C > 0) Default = 0.1. '/>
49 <param name="C_upper_bound" size="30" type="text" value="10" label="Regularization Parameter C (Upper Bound)" help='Defines the upper bound for regularization parameter C when cross-validation is used. Must have a positive value that is larger than the lower bound. Default = 10. '/>
50 <param name="a" size="30" type="text" value="0.0" label="Coefficient A" help='Used in the kernel functions above. Must be greater than zero. However, the default = 0 and translates to a = 1/n_features, where n_features is the number of features. Default = 0.'/>
51 <param name="b" size="30" type="text" value="0.0" label="Coefficient B" help='Independent term in kernel function. It is only significant in polynomial and sigmoid kernels. Default = 0.'/>
52 </inputs>
53 <outputs> <!-- Two results for the Training data (classification, accuracy) and two for the Target data (prediction, accuracy); each label names the dataset it was computed on so the four history items are distinguishable. -->
54 <data name="outClassification" format="tabular" label="${tool.name} on ${on_string}: Classification of the Training Data Set"/>
55 <data name="outClassificationAccuracy" format="tabular" label="${tool.name} on ${on_string}: Classification Accuracy of the Training Data Set"/>
56 <data name="outPrediction" format="tabular" label="${tool.name} on ${on_string}: Prediction of the Target Data Set"/>
57 <data name="outPredictionAccuracy" format="tabular" label="${tool.name} on ${on_string}: Prediction Accuracy of the Target Data Set"/>
58 </outputs>
59 <tests>
60 <test> <!-- Single regression test: the same ST000006 wide/design files serve as both training and target input, run with the linear kernel and no cross-validation (so C=1 is passed explicitly); all four outputs are compared against stored expected files. -->
61 <param name="train_wide" value="ST000006_data.tsv"/>
62 <param name="train_design" value="ST000006_design.tsv"/>
63 <param name="test_wide" value="ST000006_data.tsv"/>
64 <param name="test_design" value="ST000006_design.tsv"/>
65 <param name="group" value="White_wine_type_and_source" />
66 <param name="uniqID" value="Retention_Index" />
67 <param name="kernel" value="linear"/>
68 <param name="degree" value="3"/>
69 <param name="cross_validation" value="none"/>
70 <param name="C" value="1"/>
71 <param name="C_lower_bound" value="0.1"/>
72 <param name="C_upper_bound" value="2"/>
73 <param name="a" value="1"/>
74 <param name="b" value="1"/>
75 <output name="outClassification" file="ST000006_svm_classifier_train_classification.tsv" />
76 <output name="outClassificationAccuracy" file="ST000006_svm_classifier_train_classification_accuracy.tsv" />
77 <output name="outPrediction" file="ST000006_svm_classifier_target_classification.tsv" />
78 <output name="outPredictionAccuracy" file="ST000006_svm_classifier_target_classification_accuracy.tsv" />
79 </test>
80 </tests>
81 <help><![CDATA[
82
83 **TIP:**
84 If your data is not TAB delimited, use *Text Manipulation->Convert*.
85
86 **WARNINGS:**
87 - (1) This script automatically removes spaces and special characters from strings.
88 - (2) If a feature name starts with a number it will prepend an '_'.
89
90 **Tool Description**
91
92 **NOTE: A minimum of 100 samples is required by the tool for single or double cross validation**
93
94 Given a set of supervised samples in a Training Dataset, the SVM training algorithm builds a model based on these samples that can be used for predicting the categories of new, unclassified samples in a Target Dataset.
95 The Target Dataset is not used for model training or evaluation, only for prediction based on the finalized model.
96 SVM classification is performed on the target data and accuracy is estimated for both Target and Training Datasets.
97
98 SVM uses different kernel functions to carry out different types of classification such as radial basis (Gaussian), linear, polynomial, and sigmoid.
99 The classification model can be trained with and without cross-validation (single or double).
100
101 For single and double cross-validation: the training dataset is split differently when the model fit is performed.
102
103 In single cross-validation: the same data are used to both fit and evaluate the model.
104
105 In double cross-validation: the training dataset is split into pieces and the model fit is performed on one of the pieces and evaluated on the other pieces.
106
107 Under cross-validation, the user specifies the Lower and Upper bounds of Regularization Parameter C; without cross-validation, the user specifies Regularization Parameter C directly.
108 For more information about Regularization Parameter C, see references below:
109
110 Cortes, C. and Vapnik, V. 1995. Support-vector networks. Machine Learning. 20(3) 273-297.
111
112 Steinwart, I and Christmann, A. 2008. Support vector machines. Springer Science and Business Media.
113
114
115 To use the SVM tool, users need the following information:
116
117 (i) a Training Dataset with known categories in the training design file and
118 (ii) a Target Dataset with predicted categories in the target design file.
119 (iii) the name of the Group/Treatment classification column should be the same for both design files.
120 (iv) the Unique Feature IDs should be the same in both the wide datasets.
121 (v) the number of Unique Feature IDs should be the same in both the wide datasets.
122
123 ------------------------------------------------------------------------------
124
125 **Input**
126
127 - Four input datasets are required.
128
129 @WIDE@
130
131 **NOTE:** The sample IDs must match the sample IDs in the Design File
132 (below). Extra columns will automatically be ignored.
133
134 @METADATA@
135
136 @GROUP@
137
138 @UNIQID@
139
140 **SVM Kernel Function**
141
142 - Kernel functions available for the SVM algorithm.
143
144 **Polynomial Degree**
145
146 - Only used for the polynomial kernel.
147
148 **Cross-Validation Choice**
149
150 - Cross-validation options available for the user. 'None' corresponds to no cross-validation: the user specifies regularization parameter C manually.
151
152
153 **Regularization Parameter C**
154
155 - Penalizes potential overfitting, must be positive.
156
157
158 **Regularization Parameter C (Lower Bound)**
159
160 - Lower bound for regularization parameter C. Value must be greater than 0. Used only if cross-validation is selected.
161
162
163 **Regularization Parameter C (Upper Bound)**
164
165 - Upper bound for regularization parameter C. Value must be greater than the Lower Bound.
166
167
168 **Coefficient A**
169
170 - Used in the kernel functions above. Must be greater than zero. Default = 0, however,
171 this translates to a = 1/n_features, where n_features is the number of features.
172
173 **Coefficient B**
174
175 - Independent term in the kernel function. It is only significant in
176 polynomial and sigmoid kernels.
177
178 ------------------------------------------------------------------------------
179
180 **Output**
181
182 This tool will output two files for the Training dataset and two for the Target dataset:
183
184 Training:
185
186 (1) a TSV file containing the observed and predicted grouping classifications for each sample and
187 (2) a TSV file containing the accuracy (percentage) of the classification.
188
189 Target:
190
191 (3) a TSV file containing suspected and predicted grouping classifications for each sample and
192 (4) a TSV file containing the accuracy (percentage) of the prediction in comparison to the suspected grouping specified in the design file.
193
194 **NOTE:** Some knowledge about the SVM classifier algorithm and different kernel types is recommended for users who plan to use the tool frequently with different settings.
195
196 ]]></help>
197 <expand macro="citations"/>
198 </tool>