Mercurial > repos > iuc > chopin2

<?xml version="1.0"?>
<tool name="chopin2" id="chopin2" version="@TOOL_VERSION@+galaxy@GALAXY_VERSION@" profile="@PROFILE@" license="GPL-3.0-or-later">
    <description>Domain-Agnostic Supervised Learning with Hyperdimensional Computing</description>

    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="creator"/>
    <expand macro="requirements"/>

    <!--
        Cheetah template that builds the chopin2 command line.

        The input dataset is symlinked into the job working directory under a
        name derived from its element identifier, and that link is what chopin2
        reads. The element identifier is user-controlled, so it is reduced to a
        conservative character set before being placed inside single-quoted
        shell arguments: a quote, a slash or a leading dash in the raw
        identifier would otherwise break the quoting, make ln fail, or be
        parsed as an option. Ordinary names such as iris.csv are unchanged.

        A non-zero exit code marks the job as failed (detect_errors="exit_code").
    -->
    <command detect_errors="exit_code">
<![CDATA[
    ## Shell-safe link name: keep letters, digits, '_', '.' and '-'; replace
    ## anything else with '_'; drop leading '.'/'-' so the name can neither be
    ## '.'/'..' nor look like a command-line option; fall back to 'dataset'
    ## if nothing is left.
    #import re
    #set $dataset_link = re.sub('[^A-Za-z0-9_.-]', '_', str($dataset.element_identifier)).lstrip('.-') or 'dataset'
    ln -s '$dataset' '$dataset_link' &&

    chopin2

    --dataset '$dataset_link'
    ## The field separator follows the Galaxy datatype: comma for csv,
    ## otherwise a literal tab (bash ANSI-C quoting, '$' escaped for Cheetah).
    #if $dataset.ext == 'csv':
        --fieldsep ','
    #else:
        --fieldsep \$'\t'
    #end if

    --dimensionality ${dimensionality}
    --levels ${levels}
    --retrain ${retrain}
    --stop
    --crossv_k ${folds}

    ## Feature-selection options are only passed when the user enabled them.
    #if $feature_selection.enable_fs == "true":
        --select_features
        --group_min ${feature_selection.group_min}
        --accuracy_threshold ${feature_selection.accuracy_threshold}
        --accuracy_uncertainty_perc ${feature_selection.accuracy_uncertainty_perc}
    #end if

    --dump
    --cleanup
    ## Use the number of slots allocated by Galaxy, defaulting to 4 when unset.
    --nproc "\${GALAXY_SLOTS:-4}"
    --verbose
]]>
    </command>

    <inputs>
        <!-- Input matrix (csv or tabular); the command section links it into the job directory and passes it to chopin2. -->
        <param
            name="dataset"
            type="data"
            format="csv,tabular"
            label="Select a dataset"
            help="Input dataset with features on columns and observations on rows. The first column must contain the observation IDs, while the last column must contain classes. The header line is also required." />

        <!-- Hypervector size; defaults to 10000 and may not go below 100. -->
        <param
            name="dimensionality"
            type="integer"
            value="10000"
            min="100"
            label="Vectors dimensionality"
            help="Size of hypervectors is usually 10,000 in vector-symbolic architectures. However, lower values could work
                     with small datasets in terms of number of features and observations. Please note that you may require
                     to increase this number in case of datasets with a huge number of features." />

        <!-- Number of level vectors; at least 2. -->
        <param
            name="levels"
            type="integer"
            value="1000"
            min="2"
            label="Levels"
            help="Number of level vectors. You may consider to look at the distribution of your data in order to choose
                     the most appropriate value." />

        <!-- Upper bound on retraining iterations; 0 is the default. -->
        <param
            name="retrain"
            type="integer"
            value="0"
            min="0"
            label="Model retraining iterations"
            help="Maximum number of retraining iterations. Class hypervectors are retrained to minimize errors caused by noise." />

        <!-- k for k-fold cross-validation; at least 2. -->
        <param
            name="folds"
            type="integer"
            value="2"
            min="2"
            label="Number of folds for cross-validation"
            help="This tool makes use of k-folds cross-validation to evaluate the accuracy of the hyperdimensional model.
                     Make sure to choose a good number of folds for validating the classification model. Please note that higher number
                     of folds could significantly increase the running time." />

        <!-- Optional feature selection. The selector carries the string values "false"/"true"; the extra parameters only exist in the "true" branch. -->
        <conditional name="feature_selection">
            <param
                name="enable_fs"
                type="select"
                label="Enable feature selection"
                help="If selected, this will extract a set of features with the better discriminative power among classes.
                         The feature selection algorithm is defined as a backward variable selection method.">
                <option value="false" selected="true">Disabled</option>
                <option value="true">Enabled</option>
            </param>

            <!-- Disabled: no additional parameters. -->
            <when value="false" />

            <!-- Enabled: stopping criteria for the feature selection. -->
            <when value="true">
                <param
                    name="group_min"
                    type="integer"
                    value="1"
                    min="1"
                    label="Minimum number of selected features"
                    help="Tool will stop removing features if its number will reach this value." />

                <param
                    name="accuracy_threshold"
                    type="float"
                    value="60.0"
                    min="0.0"
                    max="100.0"
                    label="Accuracy threshold"
                    help="Stop the execution if the best accuracy reached for a group of features is lower than this value." />

                <param
                    name="accuracy_uncertainty_perc"
                    type="float"
                    value="5.0"
                    min="0.0"
                    max="100.0"
                    label="Accuracy uncertainty percentage"
                    help="Consider non optimal solutions if model accuracy is greater than the best accuracy minus this percentage." />
            </when>
        </conditional>
    </inputs>

    <!--
        Outputs collected from the job working directory, followed by the
        functional tests that exercise them.
    -->
    <outputs>
        <!-- Always produced: summary.txt, exposed as a tabular dataset with fixed column metadata and 7 comment lines. -->
        <data format="tabular" name="summary" label="${tool.name} on ${on_string}: Summary" from_work_dir="summary.txt">
            <actions>
                <action name="column_names" type="metadata" default="Run ID,Group Size,Retraining,Accuracy,Excluded Feature" />
                <action name="column_types" type="metadata" default="str,int,int,float,str" />
                <action name="comment_lines" type="metadata" default="7" />
            </actions>
        </data>

        <!-- Only produced when feature selection is enabled: selection.txt with the selected features. -->
        <data format="tabular" name="selection" label="${tool.name} on ${on_string}: Selection" from_work_dir="selection.txt">
            <!--
                enable_fs is a select whose value is the string "true" or "false".
                A bare reference is a non-empty string and therefore always truthy,
                so the value must be compared explicitly.
            -->
            <filter>feature_selection["enable_fs"] == "true"</filter>
            <actions>
                <action name="column_names" type="metadata" default="Selected Features:" />
                <action name="column_types" type="metadata" default="str" />
                <action name="comment_lines" type="metadata" default="3" />
            </actions>
        </data>
    </outputs>

    <tests>
        <!-- CSV input, feature selection disabled (default): only the summary is expected. -->
        <test expect_num_outputs="1">
            <param name="dataset" value="iris.csv" />
            <param name="dimensionality" value="1000" />
            <param name="levels" value="100" />
            <param name="retrain" value="10" />
            <param name="folds" value="5" />

            <output name="summary" ftype="tabular" value="summary.txt">
                <assert_contents>
                    <has_text_matching expression="# Run ID\tGroup Size\tRetraining\tAccuracy"/>
                    <has_text text="8f0e142ff27db7f8d2cc66cfcc05e27c" />
                </assert_contents>
            </output>
        </test>

        <!-- Same run with a tabular (tab-separated) input: only the summary is expected. -->
        <test expect_num_outputs="1">
            <param name="dataset" value="iris.tabular" />
            <param name="dimensionality" value="1000" />
            <param name="levels" value="100" />
            <param name="retrain" value="10" />
            <param name="folds" value="5" />

            <output name="summary" ftype="tabular" value="summary.txt">
                <assert_contents>
                    <has_text_matching expression="# Run ID\tGroup Size\tRetraining\tAccuracy"/>
                    <has_text text="8f0e142ff27db7f8d2cc66cfcc05e27c" />
                </assert_contents>
            </output>
        </test>

        <!-- CSV input with feature selection enabled: both summary and selection are expected. -->
        <test expect_num_outputs="2">
            <param name="dataset" value="iris.csv" />
            <param name="dimensionality" value="1000" />
            <param name="levels" value="100" />
            <param name="retrain" value="10" />
            <param name="folds" value="5" />

            <conditional name="feature_selection">
                <param name="enable_fs" value="true" />
                <param name="group_min" value="1" />
                <param name="accuracy_threshold" value="60.0" />
                <param name="accuracy_uncertainty_perc" value="5.0" />
            </conditional>

            <output name="summary" ftype="tabular" value="summary.txt">
                <assert_contents>
                    <has_text_matching expression="# Run ID\tGroup Size\tRetraining\tAccuracy" />
                    <has_text text="8f0e142ff27db7f8d2cc66cfcc05e27c" />
                </assert_contents>
            </output>

            <output name="selection" ftype="tabular" value="selection.txt">
                <assert_contents>
                    <has_text text="# Selected Features:" />
                    <has_text text="PetalLengthCm" />
                    <has_text text="PetalWidthCm" />
                    <has_text text="SepalLengthCm" />
                    <has_text text="SepalWidthCm" />
                </assert_contents>
            </output>
        </test>
    </tests>

    <help><![CDATA[
chopin2 is a domain-agnostic supervised learning classifier built according to the Hyperdimensional Computing paradigm.
It also implements a feature selection method based on the backward variable elimination strategy.

-----

**Input**

The input is a CSV or tab-separated (tabular) file representing a matrix with the observations on the rows and features on columns.
The first column must contain the observation IDs, while the last column contains the classes.
Also, the first line must contain the header with the column names.

The tool does not support datasets with missing values, and it supports numerical datasets only.
Please note that categorical values are allowed in the first and last columns.

-----

**Output**

The output is a summary table with information about the accuracy of the hyperdimensional model and
the number of retraining iterations that were required to achieve that level of accuracy.

In case the feature selection is enabled, it also returns a file with the list of selected features
that come out from the hyperdimensional classification model with the best accuracy.

-----

.. class:: infomark

**Notes**

Please visit the official GitHub repository_ for other information about `chopin2`.

.. _repository: https://github.com/cumbof/chopin2
    ]]></help>

    <expand macro="citations"/>
</tool>
author	iuc
date	Tue, 31 Jan 2023 16:31:19 +0000
parents
children	693bfd012601