Mercurial > repos > malex > secimtools

<tool id="secimtools_svm_classifier" name="Support Vector Machine (SVM) Classifier" version="@WRAPPER_VERSION@">
    <description>- Predict sample groups.</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <!-- Builds the svm_classifier.py invocation from the tool parameters.
         Every substituted value is single-quoted so that dataset paths and
         free-text parameters reach the script as exactly one argument each,
         even when they contain spaces or shell metacharacters. -->
    <command><![CDATA[
svm_classifier.py
--train_wide '$train_wide'
--train_design '$train_design'
--test_wide '$test_wide'
--test_design '$test_design'
--group '$group'
--ID '$uniqID'
--kernel '$kernel'
--degree '$degree'
--C '$C'
--cross_validation '$cross_validation'
--C_lower_bound '$C_lower_bound'
--C_upper_bound '$C_upper_bound'
--a '$a'
--b '$b'
--outClassification         '$outClassification'
--outClassificationAccuracy '$outClassificationAccuracy'
--outPrediction             '$outPrediction'
--outPredictionAccuracy     '$outPredictionAccuracy'
    ]]></command>
    <inputs>
        <!-- Training and target datasets, each with its matching design file. -->
        <param name="train_wide" type="data" format="tabular" label="Training wide dataset" help="Dataset missing? See TIP below."/>
        <param name="train_design" type="data" format="tabular" label="Training design file" help="Dataset missing? See TIP below."/>
        <param name="test_wide" type="data" format="tabular" label="Target wide dataset" help="Dataset missing? See TIP below."/>
        <param name="test_design" type="data" format="tabular" label="Target design file" help="Dataset missing? See TIP below."/>
        <!-- Column names that locate the group labels (design files) and the feature identifiers (wide datasets). -->
        <param name="group" type="text" size="30" value="" label="Group/Treatment" help="Name of the column in your Training and Target design files that contains group classifications."/>
        <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your Training and Target wide datasets that contains unique identifiers."/>
        <!-- A select parameter takes its default from selected="true" on an option;
             a value attribute on the select itself does not set it, so the intended
             defaults (rbf kernel, double cross-validation) are declared on the options. -->
        <param name="kernel" type="select" display="radio" label="Select a SVM Kernel Function">
            <option value="rbf" selected="true">Radial Basis function (Gaussian)</option>
            <option value="linear">Linear</option>
            <option value="poly">Polynomial</option>
            <option value="sigmoid">Sigmoid</option>
        </param>
        <param name="degree" type="text" size="30" value="3" label="Polynomial Degree" help="Only used for the polynomial kernel."/>
        <param name="cross_validation" type="select" display="radio" label="Select Cross-Validation">
            <option value="none">None</option>
            <option value="single">Single</option>
            <option value="double" selected="true">Double</option>
        </param>
        <!-- C is used directly when cross-validation is "none"; otherwise the lower and upper bounds are used. -->
        <param name="C" type="text" size="30" value="1" label="Regularization Parameter C" help="See references in tool description for setting this parameter. Value must be positive (C > 0). Used only if cross-validation is not selected. Default = 1."/>
        <param name="C_lower_bound" type="text" size="30" value="0.1" label="Regularization Parameter C (Lower Bound)" help="Defines the lower bound for regularization parameter C when cross-validation is used. Must have a positive value (C > 0). Default = 0.1."/>
        <param name="C_upper_bound" type="text" size="30" value="10" label="Regularization Parameter C (Upper Bound)" help="Defines the upper bound for regularization parameter C when cross-validation is used. Must have a positive value that is larger than the lower bound. Default = 10."/>
        <!-- Kernel coefficients passed to the script as the a and b options. -->
        <param name="a" type="text" size="30" value="0.0" label="Coefficient A" help="Used in the kernel functions above. Must be greater than zero. However, the default = 0 and translates to a = 1/n_features, where n_features is the number of features. Default = 0."/>
        <param name="b" type="text" size="30" value="0.0" label="Coefficient B" help="Independent term in kernel function. It is only significant in polynomial and sigmoid kernels. Default = 0."/>
    </inputs>
    <outputs>
        <!-- Training-set results: per-sample classification and its accuracy. -->
        <data name="outClassification" format="tabular" label="${tool.name} on ${on_string}: Classification of the Training Data Set"/>
        <data name="outClassificationAccuracy" format="tabular" label="${tool.name} on ${on_string}: Classification Accuracy of the Training Data Set"/>
        <!-- Target-set results: per-sample prediction and its accuracy. Each label
             names the target set and is distinct, so the four history items can be told apart. -->
        <data name="outPrediction" format="tabular" label="${tool.name} on ${on_string}: Prediction of the Target Data Set"/>
        <data name="outPredictionAccuracy" format="tabular" label="${tool.name} on ${on_string}: Prediction Accuracy of the Target Data Set"/>
    </outputs>
    <tests>
        <!-- Linear kernel, no cross-validation; the ST000006 data serves as both training and target set. -->
        <test>
            <!-- Datasets and design files -->
            <param name="train_wide" value="ST000006_data.tsv"/>
            <param name="train_design" value="ST000006_design.tsv"/>
            <param name="test_wide" value="ST000006_data.tsv"/>
            <param name="test_design" value="ST000006_design.tsv"/>
            <!-- Column selectors -->
            <param name="group" value="White_wine_type_and_source"/>
            <param name="uniqID" value="Retention_Index"/>
            <!-- Model settings -->
            <param name="kernel" value="linear"/>
            <param name="degree" value="3"/>
            <param name="cross_validation" value="none"/>
            <param name="C" value="1"/>
            <param name="C_lower_bound" value="0.1"/>
            <param name="C_upper_bound" value="2"/>
            <param name="a" value="1"/>
            <param name="b" value="1"/>
            <!-- Expected outputs -->
            <output name="outClassification" file="ST000006_svm_classifier_train_classification.tsv"/>
            <output name="outClassificationAccuracy" file="ST000006_svm_classifier_train_classification_accuracy.tsv"/>
            <output name="outPrediction" file="ST000006_svm_classifier_target_classification.tsv"/>
            <output name="outPredictionAccuracy" file="ST000006_svm_classifier_target_classification_accuracy.tsv"/>
        </test>
    </tests>
    <help><![CDATA[

**TIP:**
If your data is not TAB delimited, use *Text Manipulation->Convert*.

**WARNINGS:**
    - (1) This script automatically removes spaces and special characters from strings.
    - (2) If a feature name starts with a number it will prepend an '_'.

**Tool Description**

**NOTE: A minimum of 100 samples is required by the tool for single or double cross validation**

Given a set of supervised samples in a Training Dataset, the SVM training algorithm builds a model based on these samples that can be used for predicting the categories of new, unclassified samples in a Target Dataset.
The Target Dataset is not used for model training or evaluation, only for prediction based on the finalized model.
SVM classification is performed on the target data and accuracy is estimated for both Target and Training Datasets.

SVM uses different kernel functions to carry out different types of classification, such as radial basis (Gaussian), linear, polynomial, and sigmoid.
The classification model can be trained with and without cross-validation (single or double).

For single and double cross-validation: the training dataset is split differently when the model fit is performed.

In single cross-validation: the same data are used to both fit and evaluate the model.

In double cross-validation: the training dataset is split into pieces and the model fit is performed on one of the pieces and evaluated on the other pieces.

Without cross-validation, the user specifies Regularization Parameter C directly; under cross-validation, the user specifies the Lower and Upper bounds of Regularization Parameter C instead.
For more information about Regularization Parameter C, see references below:

Cortes, C. and Vapnik, V. 1995.  Support-vector networks. Machine Learning. 20(3) 273-297.

Steinwart, I and Christmann, A.  2008.  Support vector machines. Springer Science and Business Media.


To use the SVM tool, users need the following information:

(i) a Training Dataset with known categories in the training design file and
(ii) a Target Dataset with suspected categories in the target design file.
(iii) the name of the Group/Treatment classification column should be the same for both design files.
(iv) the Unique Feature IDs should be the same in both the wide datasets.
(v) the number of Unique Feature IDs should be the same in both the wide datasets.

------------------------------------------------------------------------------

**Input**

    - Four input datasets are required.

@WIDE@

**NOTE:** The sample IDs must match the sample IDs in the Design File
(below). Extra columns will automatically be ignored.

@METADATA@

@GROUP@

@UNIQID@

**SVM Kernel Function**

    - Kernel functions available for the SVM algorithm.

**Polynomial Degree**

    - Only used for the polynomial kernel.

**Cross-Validation Choice**

    - Cross-validation options available for the user. 'None' corresponds to no cross-validation; the user specifies regularization parameter C manually.


**Regularization Parameter C**

    - Penalizes potential overfitting, must be positive.


**Regularization Parameter C (Lower Bound)**

    - Lower bound for regularization parameter C. Value must be greater than 0. Used only if cross-validation is selected.


**Regularization Parameter C (Upper Bound)**

    - Upper bound for regularization parameter C. Value must be greater than the Lower Bound.


**Coefficient A**

    - Used in the kernel functions above.  Must be greater than zero. Default = 0, however,
      this translates to a = 1/n_features, where n_features is the number of features.

**Coefficient B**

    - Independent term in the kernel function. It is only significant in
      polynomial and sigmoid kernels.

------------------------------------------------------------------------------

**Output**

This tool will output two files for the Training dataset and two for the Target dataset:

Training:

(1) a TSV file containing the observed and predicted grouping classifications for each sample and
(2) a TSV file containing the accuracy (percentage) of the classification.

Target:

(3) a TSV file containing suspected and predicted grouping classifications for each sample and
(4) a TSV file containing the accuracy (percentage) of the prediction in comparison to the suspected grouping specified in the design file.

**NOTE:** Some knowledge about the SVM classifier algorithm and different kernel types is recommended for users who plan to use the tool frequently with different settings.

    ]]></help>
    <expand macro="citations"/>
</tool>
author	malex
date	Mon, 08 Mar 2021 22:04:06 +0000
parents
children	caba07f41453