view pycaret_train.xml @ 0:915447b14520 draft

planemo upload for repository https://github.com/goeckslab/Galaxy-Pycaret commit d79b0f722b7d09505a526d1a4332f87e548a3df1
author goeckslab
date Wed, 11 Dec 2024 05:00:00 +0000
parents
children f6def1b90150
line wrap: on
line source

<tool id="pycaret_compare" name="PyCaret Model Comparison" version="@VERSION@" profile="@PROFILE@">
    <description>compares different machine learning models on a dataset using PyCaret and performs feature analyses using Random Forest and LightGBM</description>
    <macros>
        <import>pycaret_macros.xml</import>
    </macros>
    <expand macro="python_requirements" />
    <command>
        <![CDATA[
        ## Builds the pycaret_train.py command line.
        ## Parameters declared inside a <conditional> are addressed through the
        ## name of that conditional (model_selection, advanced_settings and the
        ## nested cross_validation), which is the documented way to reach them.
        ## Substituted values are single-quoted so that paths and user input
        ## are passed to the script as single shell tokens.
        python '$__tool_directory__/pycaret_train.py'
            --input_file '$input_file'
            --target_col '$target_feature'
            --output_dir "`pwd`"
            --random_seed '$random_seed'
        ## Restrict the comparison to the selected models; when nothing is
        ## selected the option is omitted entirely.
        #if $model_selection.model_type == "classification"
            #if $model_selection.classification_models
                --models '$model_selection.classification_models'
            #end if
        #elif $model_selection.model_type == "regression"
            #if $model_selection.regression_models
                --models '$model_selection.regression_models'
            #end if
        #end if
        ## The parameters below only exist when the user chose to customize
        ## the default settings.
        #if $advanced_settings.customize_defaults == "true"
            #if $advanced_settings.train_size
                --train_size '$advanced_settings.train_size'
            #end if
            #if $advanced_settings.normalize
                --normalize
            #end if
            #if $advanced_settings.feature_selection
                --feature_selection
            #end if
            ## cross_validation_folds is only defined in the "true" branch of
            ## the cross_validation conditional, so it is read inside the guard.
            #if $advanced_settings.cross_validation.enable_cross_validation == "true"
                --cross_validation
                #if $advanced_settings.cross_validation.cross_validation_folds
                    --cross_validation_folds '$advanced_settings.cross_validation.cross_validation_folds'
                #end if
            #end if
            #if $advanced_settings.remove_outliers
                --remove_outliers
            #end if
            #if $advanced_settings.remove_multicollinearity
                --remove_multicollinearity
            #end if
            #if $advanced_settings.polynomial_features
                --polynomial_features
            #end if
            #if $advanced_settings.fix_imbalance
                --fix_imbalance
            #end if
        #end if
        ## The test dataset is optional.
        #if $test_file
            --test_file '$test_file'
        #end if
        --model_type '$model_selection.model_type'
        ]]>
    </command>
    <inputs>
        <!-- Training data; also the source of the column list offered by target_feature. -->
        <param name="input_file" type="data" format="csv,tabular" label="Train Dataset (CSV or TSV)" />
        <!-- Optional test set. The help text is kept on one line so that no
             indentation whitespace ends up inside the displayed attribute value. -->
        <param name="test_file" type="data" format="csv,tabular" optional="true" label="Test Dataset (CSV or TSV)"
            help="If a test set is not provided, the selected training set will be split into training, validation, and test sets. If a test set is provided, the training set will only be split into training and validation sets. Note that cross-validation is applied by default." />
        <!-- Column of input_file to predict; a single column chosen by header name. -->
        <param name="target_feature" multiple="false" type="data_column" use_header_names="true" data_ref="input_file" label="Select the target column:" />
        <!-- Task selector. The chosen task decides which model list is shown;
             the option values of the model lists are the identifiers that the
             command template forwards through the models option. -->
        <conditional name="model_selection">
            <param name="model_type"
                   type="select"
                   label="Task">
                <option value="classification">classification</option>
                <option value="regression">regression</option>
            </param>

            <!-- Models offered for classification tasks. -->
            <when value="classification">
                <param name="classification_models"
                       type="select"
                       multiple="true"
                       label="Only Select Classification Models if you don't want to compare all models">
                    <option value="lr">Logistic Regression</option>
                    <option value="knn">K Neighbors Classifier</option>
                    <option value="nb">Naive Bayes</option>
                    <option value="dt">Decision Tree Classifier</option>
                    <option value="svm">SVM - Linear Kernel</option>
                    <option value="rbfsvm">SVM - Radial Kernel</option>
                    <option value="gpc">Gaussian Process Classifier</option>
                    <option value="mlp">MLP Classifier</option>
                    <option value="ridge">Ridge Classifier</option>
                    <option value="rf">Random Forest Classifier</option>
                    <option value="qda">Quadratic Discriminant Analysis</option>
                    <option value="ada">Ada Boost Classifier</option>
                    <option value="gbc">Gradient Boosting Classifier</option>
                    <option value="lda">Linear Discriminant Analysis</option>
                    <option value="et">Extra Trees Classifier</option>
                    <option value="xgboost">Extreme Gradient Boosting</option>
                    <option value="lightgbm">Light Gradient Boosting Machine</option>
                    <option value="catboost">CatBoost Classifier</option>
                </param>
            </when>

            <!-- Models offered for regression tasks. -->
            <when value="regression">
                <param name="regression_models"
                       type="select"
                       multiple="true"
                       label="Only Select Regression Models if you don't want to compare all models">
                    <option value="lr">Linear Regression</option>
                    <option value="lasso">Lasso Regression</option>
                    <option value="ridge">Ridge Regression</option>
                    <option value="en">Elastic Net</option>
                    <option value="lar">Least Angle Regression</option>
                    <option value="llar">Lasso Least Angle Regression</option>
                    <option value="omp">Orthogonal Matching Pursuit</option>
                    <option value="br">Bayesian Ridge</option>
                    <option value="ard">Automatic Relevance Determination</option>
                    <option value="par">Passive Aggressive Regressor</option>
                    <option value="ransac">Random Sample Consensus</option>
                    <option value="tr">TheilSen Regressor</option>
                    <option value="huber">Huber Regressor</option>
                    <option value="kr">Kernel Ridge</option>
                    <option value="svm">Support Vector Regression</option>
                    <option value="knn">K Neighbors Regressor</option>
                    <option value="dt">Decision Tree Regressor</option>
                    <option value="rf">Random Forest Regressor</option>
                    <option value="et">Extra Trees Regressor</option>
                    <option value="ada">AdaBoost Regressor</option>
                    <option value="gbr">Gradient Boosting Regressor</option>
                    <option value="mlp">MLP Regressor</option>
                    <option value="xgboost">Extreme Gradient Boosting</option>
                    <option value="lightgbm">Light Gradient Boosting Machine</option>
                    <option value="catboost">CatBoost Regressor</option>
                </param>
            </when>
        </conditional>
        <!-- Seed that is always passed to the training script. -->
        <param name="random_seed" type="integer" value="42" label="Random Seed" help="Random seed for reproducibility." />
        <!-- Optional experiment settings. The command template only forwards
             these values when customize_defaults is "true". -->
        <conditional name="advanced_settings">
            <param name="customize_defaults" type="select" label="Customize Default Settings?" help="Select yes if you want to customize the default settings of the experiment.">
                <option value="false" selected="true">No</option>
                <option value="true">Yes</option>
            </param>
            <when value="true">
                <param name="train_size" type="float" value="0.7" min="0.1" max="0.9" label="Train Size" help="Proportion of the dataset to include in the train split." />
                <param name="normalize" type="boolean" truevalue="True" falsevalue="False" checked="false" label="Normalize Data" help="Whether to normalize data before training." />
                <param name="feature_selection" type="boolean" truevalue="True" falsevalue="False" checked="false" label="Feature Selection" help="Whether to perform feature selection." />
                <!-- The fold count is only offered while cross-validation is enabled. -->
                <conditional name="cross_validation">
                    <param name="enable_cross_validation" type="select" label="Enable Cross Validation?" help="Select whether to enable cross-validation. Default: Yes" >
                        <option value="false" >No</option>
                        <option value="true" selected="true">Yes</option>
                    </param>
                    <when value="true">
                        <param name="cross_validation_folds" type="integer" value="10" min="2" max="20" label="Cross Validation Folds" help="Number of folds to use for cross-validation. Default: 10" />
                    </when>
                    <when value="false">
                        <!-- No additional parameters to show if the user selects 'No' -->
                    </when>
                </conditional>
                <param name="remove_outliers" type="boolean" truevalue="True" falsevalue="False" checked="false" label="Remove Outliers" help="Whether to remove outliers from the dataset before training. Default: False" />
                <param name="remove_multicollinearity" type="boolean" truevalue="True" falsevalue="False" checked="false" label="Remove Multicollinearity" help="Whether to remove multicollinear features before training. Default: False" />
                <param name="polynomial_features" type="boolean" truevalue="True" falsevalue="False" checked="false" label="Polynomial Features" help="Whether to create polynomial features before training. Default: False" />
                <param name="fix_imbalance" type="boolean" truevalue="True" falsevalue="False" checked="false" label="Fix Imbalance" help="ONLY for classification! Whether to use SMOTE or similar methods to fix imbalance in the dataset. Default: False" />
            </when>
            <when value="false">
                <!-- No additional parameters to show if the user selects 'No' -->
            </when>
        </conditional>
    </inputs>
    <!-- Files collected from the job working directory, which is the
         directory handed to the script as its output directory. -->
    <outputs>
        <data name="model" format="h5" from_work_dir="pycaret_model.h5" label="${tool.name} best model on ${on_string}" />
        <data name="comparison_result" format="html" from_work_dir="comparison_result.html" label="${tool.name} Comparison result on ${on_string}"/>
        <!-- Hidden in the history by default. -->
        <data name="best_model_csv" format="csv" from_work_dir="best_model.csv" label="${tool.name} parameters of the best model on ${on_string}" hidden="true" />
    </outputs>
    <tests>
        <!-- Classification with customized experiment settings. -->
        <test>
            <param name="input_file" value="pcr.tsv"/>
            <param name="target_feature" value="11"/>
            <conditional name="model_selection">
                <param name="model_type" value="classification"/>
            </conditional>
            <param name="random_seed" value="42"/>
            <conditional name="advanced_settings">
                <param name="customize_defaults" value="true"/>
                <param name="train_size" value="0.8"/>
                <param name="normalize" value="true"/>
                <param name="feature_selection" value="true"/>
                <conditional name="cross_validation">
                    <param name="enable_cross_validation" value="true"/>
                    <param name="cross_validation_folds" value="5"/>
                </conditional>
                <param name="remove_outliers" value="true"/>
                <param name="remove_multicollinearity" value="true"/>
            </conditional>
            <output name="model" file="expected_model_classification_customized.h5" compare="sim_size"/>
            <output name="comparison_result" file="expected_comparison_result_classification_customized.html" compare="sim_size"/>
            <output name="best_model_csv" value="expected_best_model_classification_customized.csv"/>
        </test>

        <!-- Classification with the default settings. -->
        <test>
            <param name="input_file" value="pcr.tsv"/>
            <param name="target_feature" value="11"/>
            <conditional name="model_selection">
                <param name="model_type" value="classification"/>
            </conditional>
            <param name="random_seed" value="42"/>
            <output name="model" file="expected_model_classification.h5" compare="sim_size"/>
            <output name="comparison_result" file="expected_comparison_result_classification.html" compare="sim_size"/>
            <output name="best_model_csv" value="expected_best_model_classification.csv"/>
        </test>

        <!-- Regression with the default settings. -->
        <test>
            <param name="input_file" value="auto-mpg.tsv"/>
            <param name="target_feature" value="1"/>
            <conditional name="model_selection">
                <param name="model_type" value="regression"/>
            </conditional>
            <param name="random_seed" value="42"/>
            <output name="model" file="expected_model_regression.h5" compare="sim_size"/>
            <output name="comparison_result" file="expected_comparison_result_regression.html" compare="sim_size"/>
            <output name="best_model_csv" value="expected_best_model_regression.csv"/>
        </test>
    </tests>
    <help>
        This tool uses PyCaret to train and evaluate machine learning models.
        It compares different models on a dataset and provides the best model based on the performance metrics.

        **Outputs**

        - **Model**: The best model trained on the dataset in h5 format.


        - **Comparison Result**: The comparison result of different models in HTML format.
          It contains the performance metrics of different models, plots of the best model
          on the testing set (or part of the training set if a separate test set is not uploaded), and feature analysis plots.

    </help>
    <expand macro="macro_citations" />
</tool>