diff svm_classifier.xml @ 1:2e7d47c0b027 draft

"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
author malex
date Mon, 08 Mar 2021 22:04:06 +0000
parents
children caba07f41453
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/svm_classifier.xml	Mon Mar 08 22:04:06 2021 +0000
@@ -0,0 +1,198 @@
+<tool id="secimtools_svm_classifier" name="Support Vector Machine (SVM) Classifier" version="@WRAPPER_VERSION@">
+    <description>- Predict sample groups.</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <command><![CDATA[
+svm_classifier.py
+--train_wide $train_wide
+--train_design $train_design
+--test_wide $test_wide
+--test_design $test_design
+--group $group
+--ID $uniqID
+--kernel $kernel
+--degree $degree
+--C $C
+--cross_validation $cross_validation
+--C_lower_bound $C_lower_bound
+--C_upper_bound $C_upper_bound
+--a $a
+--b $b
+--outClassification         $outClassification
+--outClassificationAccuracy $outClassificationAccuracy
+--outPrediction             $outPrediction
+--outPredictionAccuracy     $outPredictionAccuracy
+    ]]></command>
+    <inputs>
+        <param name="train_wide" type="data" format="tabular" label="Training wide dataset" help="Dataset missing? See TIP below."/>
+        <param name="train_design" type="data" format="tabular" label="Training design file" help="Dataset missing? See TIP below."/>
+        <param name="test_wide" type="data" format="tabular" label="Target wide dataset" help="Dataset missing? See TIP below."/>
+        <param name="test_design" type="data" format="tabular" label="Target design file" help="Dataset missing? See TIP below."/>
+        <param name="group" size="30" type="text" value="" label="Group/Treatment" help="Name of the column in your Training and Target design files that contain group classifications."/>
+        <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your Training and Target wide datasets that contain unique identifiers."/>
+        <param name="kernel" type="select" size="30" display="radio" value="rbf" label="Select a SVM Kernel Function">
+            <option value="rbf">Radial Basis function (Gaussian)</option>
+            <option value="linear">Linear</option>
+            <option value="poly">Polynomial</option>
+            <option value="sigmoid">Sigmoid</option>
+        </param>
+        <param name="degree" size="30" type="text" value="3" label="Polynomial Degree" help='Only used for the polynomial kernel.'/>
+        <param name="cross_validation" type="select" size="30" display="radio" value="double" label="Select Cross-Validation">
+            <option value="none">None</option>
+            <option value="single">Single</option>
+            <option value="double">Double</option>
+        </param>
+        <param name="C" size="30" type="text" value="1" label="Regularization Parameter C" help='See references in tool description for setting this parameter. Value must be positive (C > 0). Used only if cross-validation is not selected. Default = 1.'/>
+        <param name="C_lower_bound" size="30" type="text" value="0.1" label="Regularization Parameter C (Lower Bound)" help='Defines the lower bound for regularization parameter C when cross-validation is used. Must have a positive value (C > 0) Default = 0.1. '/>
+        <param name="C_upper_bound" size="30" type="text" value="10" label="Regularization Parameter C (Upper Bound)" help='Defines the upper bound for regularization parameter C when cross-validation is used. Must have a positive value that is larger than the lower bound. Default = 10. '/>
+        <param name="a" size="30" type="text" value="0.0" label="Coefficient A" help='Used in the kernel functions above.  Must be greater than zero. However, the default = 0 and translates to a = 1/n_features, where n_features is the number of features. Default = 0.'/>
+        <param name="b" size="30" type="text" value="0.0" label="Coefficient B" help='Independent term in kernel function. It is only significant in polynomial and sigmoid kernels. Default = 0.'/>
+    </inputs>
+    <outputs>
+        <data name="outClassification" format="tabular" label="${tool.name} on ${on_string}: Classification of the Training Data Set"/>
+        <data name="outClassificationAccuracy" format='tabular' label="${tool.name} on ${on_string}: Classification Accuracy of the Training Data Set"/>
+        <data name="outPrediction" format="tabular" label="${tool.name} on ${on_string}: Prediction Accuracy of the Training Data Set"/>
+        <data name="outPredictionAccuracy" format='tabular' label="${tool.name} on ${on_string}: Prediction Accuracy of the Training Data Set"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="train_wide"       value="ST000006_data.tsv"/>
+            <param name="train_design"     value="ST000006_design.tsv"/>
+            <param name="test_wide"        value="ST000006_data.tsv"/>
+            <param name="test_design"      value="ST000006_design.tsv"/>
+            <param name="group"            value="White_wine_type_and_source" />
+            <param name="uniqID"           value="Retention_Index" />
+            <param name="kernel"           value="linear"/>
+            <param name="degree"           value="3"/>
+            <param name="cross_validation" value="none"/>
+            <param name="C"                value="1"/>
+            <param name="C_lower_bound"    value="0.1"/>
+            <param name="C_upper_bound"    value="2"/>
+            <param name="a"                value="1"/>
+            <param name="b"                value="1"/>
+            <output name="outClassification"         file="ST000006_svm_classifier_train_classification.tsv" />
+            <output name="outClassificationAccuracy" file="ST000006_svm_classifier_train_classification_accuracy.tsv" />
+            <output name="outPrediction"             file="ST000006_svm_classifier_target_classification.tsv" />
+            <output name="outPredictionAccuracy"     file="ST000006_svm_classifier_target_classification_accuracy.tsv" />
+        </test>
+    </tests>
+    <help><![CDATA[
+
+**TIP:**
+If your data is not TAB delimited, use *Text Manipulation-&gt;Convert*.
+
+**WARNINGS:**
+    - (1) This script automatically removes spaces and special characters from strings.
+    - (2) If a feature name starts with a number it will prepend an '_'.
+
+**Tool Description**
+
+**NOTE: A minimum of 100 samples is required by the tool for single or double cross validation**
+
+Given a set of supervised samples in a Training Dataset, the SVM training algorithm builds a model based on these samples that can be used for predicting the categories of new, unclassified samples in a Target Dataset.
+The Target Dataset is not used for model training or evaluation, only for prediction based on the finalized model.
+SVM classification is performed on the target data and accuracy is estimated for both Target and Training Datasets.
+
+SVM uses different kernel functions to carry out different types of classification such as radial bassis (gaussian), linear, polynomial, and sigmoid.
+The classification model can be trained with and without cross-validation (single or double).
+
+For single and double cross-validation: the training dataset is split differently when the model fit is performed.
+
+In single cross-validation: the same data are used to both fit and evaluate the model.
+
+In double cross-validation: the training dataset is split into pieces and the model fit is performed on one of the pieces and evaluated on the other pieces.
+
+Under cross-validation, the user specifies Regularization Parameter C and the Upper and Lower bounds of Regularization Parameter C.
+For more information about Regularization Parameter C, see references below:
+
+Cortes, C. and Vapnik, V. 1995.  Support-vector networks. Machine Learning. 20(3) 273-297.
+
+Steinwart, I and Christmann, A.  2008.  Support vector machines. Springer Science and Business Media.
+
+
+To use the SVM tool, users need the following information:
+
+(i) a Training Dataset with known categories in the training design file and
+(ii) a Target Dataset with predicted categories in the target design file.
+(iii) the name of the Group/Treatment classification column should be the same for both design files.
+(iv) the Unique Feature IDs should be the same in both the wide datasets.
+(v) the number of Unique Feature IDs should be the same in both the wide datasets.
+
+------------------------------------------------------------------------------
+
+**Input**
+
+    - Four input datasets are required.
+
+@WIDE@
+
+**NOTE:** The sample IDs must match the sample IDs in the Design File
+(below). Extra columns will automatically be ignored.
+
+@METADATA@
+
+@GROUP@
+
+@UNIQID@
+
+**SVM Kernel Function**
+
+    - Kernel functions available for the SVM algorithm.
+
+**Polynomial Degree**
+
+    - Only used for the polynomial kernel.
+
+**Cross-Validation Choice**
+
+    - Cross-validation options available for the user. 'None' corresponds to no cross-validation- the user specifies regularization parameter C manually.
+
+
+**Regularization Parameter C**
+
+    - Penalizes potential overfitting, must be positive.
+
+
+**Regularization Parameter C (Lower Bound)**
+
+    - Lower bound for regularization parameter C. Value must be greater than 0. Only if cross-validation is selected.
+
+
+**Regularization Parameter C (Upper Bound)**
+
+    - Upper bound for regularization parameter C. Value must be greater than the Lower Bound.
+
+
+**Coefficient A**
+
+    - Used in the kernel functions above.  Must be greater than zero. Default = 0, however,
+      this translates to a = 1/n_features, where n_features is the number of features.
+
+**Coefficent B**
+
+    - Independent term in the kernel function. It is only significant in
+      polynomial and sigmoid kernels.
+
+------------------------------------------------------------------------------
+
+**Output**
+
+This tool will output two files for the Training dataset and two for the Target datset:
+
+Training:
+
+(1) a TSV file containing the observed and predicted grouping classifications for each sample and
+(2) a TSV file containing the accuracy (percentage) of the classification.
+
+Target:
+
+(3) a TSV file containing suspected and predicted grouping classifications for each sample and
+(4) a TSV file containing the accuracy (percentage) of the prediction in comparison to the suspected grouping specified in the design file.
+
+**NOTE:** Some knowledge about the SVM classifier algorithm and different kernel types is recommended for users who plan to use the tool frequently with different settings.
+
+    ]]></help>
+    <expand macro="citations"/>
+</tool>