diff pycaret_train.xml @ 0:915447b14520 draft

planemo upload for repository https://github.com/goeckslab/Galaxy-Pycaret commit d79b0f722b7d09505a526d1a4332f87e548a3df1
author goeckslab
date Wed, 11 Dec 2024 05:00:00 +0000
parents
children f6def1b90150
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pycaret_train.xml	Wed Dec 11 05:00:00 2024 +0000
@@ -0,0 +1,209 @@
+<tool id="pycaret_compare" name="PyCaret Model Comparison" version="@VERSION@" profile="@PROFILE@">
+    <description>compares different machine learning models on a dataset using PyCaret. Do feature analyses using Random Forest and LightGBM. </description>
+    <macros>
+        <import>pycaret_macros.xml</import>
+    </macros>
+    <expand macro="python_requirements" />
+    <command>
+        <![CDATA[
+        python $__tool_directory__/pycaret_train.py --input_file $input_file --target_col $target_feature --output_dir "`pwd`" --random_seed $random_seed
+        #if $model_type == "classification"
+            #if $classification_models
+                --models $classification_models
+            #end if
+        #end if
+        #if $model_type == "regression"
+            #if $regression_models
+                --models $regression_models
+            #end if
+        #end if
+        #if $customize_defaults == "true"
+                #if $train_size
+                --train_size $train_size 
+                #end if
+                #if $normalize
+                --normalize  
+                #end if
+                #if $feature_selection
+                --feature_selection
+                #end if
+                #if $enable_cross_validation == "true" 
+                --cross_validation 
+                #end if
+                #if $cross_validation_folds
+                --cross_validation_folds $cross_validation_folds 
+                #end if
+                #if $remove_outliers
+                --remove_outliers  
+                #end if
+                #if $remove_multicollinearity
+                --remove_multicollinearity 
+                #end if
+                #if $polynomial_features
+                --polynomial_features  
+                #end if
+                #if $fix_imbalance
+                --fix_imbalance 
+                #end if
+        #end if
+        #if $test_file
+            --test_file $test_file 
+        #end if 
+        --model_type $model_type    
+        ]]>
+    </command>
+    <inputs>
+        <param name="input_file" type="data" format="csv,tabular" label="Train Dataset (CSV or TSV)" />
+        <param name="test_file" type="data" format="csv,tabular" optional="true" label="Test Dataset (CSV or TSV)"
+        help="If a test set is not provided, 
+        the selected training set will be split into training, validation, and test sets. 
+        If a test set is provided, the training set will only be split into training and validation sets. 
+        BTW, cross-validation is always applied by default." />
+       <param name="target_feature" multiple="false" type="data_column" use_header_names="true" data_ref="input_file" label="Select the target column:" />
+        <conditional name="model_selection">
+            <param name="model_type" type="select" label="Task">
+                <option value="classification">classification</option>
+                <option value="regression">regression</option>
+            </param>
+            <when value="classification">
+                <param name="classification_models" type="select" multiple="true" label="Only Select Classification Models if you don't want to compare all models">
+                    <option value="lr">Logistic Regression</option>
+                    <option value="knn">K Neighbors Classifier</option>
+                    <option value="nb">Naive Bayes</option>
+                    <option value="dt">Decision Tree Classifier</option>
+                    <option value="svm">SVM - Linear Kernel</option>
+                    <option value="rbfsvm">SVM - Radial Kernel</option>
+                    <option value="gpc">Gaussian Process Classifier</option>
+                    <option value="mlp">MLP Classifier</option>
+                    <option value="ridge">Ridge Classifier</option>
+                    <option value="rf">Random Forest Classifier</option>
+                    <option value="qda">Quadratic Discriminant Analysis</option>
+                    <option value="ada">Ada Boost Classifier</option>
+                    <option value="gbc">Gradient Boosting Classifier</option>
+                    <option value="lda">Linear Discriminant Analysis</option>
+                    <option value="et">Extra Trees Classifier</option>
+                    <option value="xgboost">Extreme Gradient Boosting</option>
+                    <option value="lightgbm">Light Gradient Boosting Machine</option>
+                    <option value="catboost">CatBoost Classifier</option>
+                </param>
+            </when>
+            <when value="regression">
+                <param name="regression_models" type="select" multiple="true" label="Only Select Regression Models if you don't want to compare all models">
+                    <option value="lr">Linear Regression</option>
+                    <option value="lasso">Lasso Regression</option>
+                    <option value="ridge">Ridge Regression</option>
+                    <option value="en">Elastic Net</option>
+                    <option value="lar">Least Angle Regression</option>
+                    <option value="llar">Lasso Least Angle Regression</option>
+                    <option value="omp">Orthogonal Matching Pursuit</option>
+                    <option value="br">Bayesian Ridge</option>
+                    <option value="ard">Automatic Relevance Determination</option>
+                    <option value="par">Passive Aggressive Regressor</option>
+                    <option value="ransac">Random Sample Consensus</option>
+                    <option value="tr">TheilSen Regressor</option>
+                    <option value="huber">Huber Regressor</option>
+                    <option value="kr">Kernel Ridge</option>
+                    <option value="svm">Support Vector Regression</option>
+                    <option value="knn">K Neighbors Regressor</option>
+                    <option value="dt">Decision Tree Regressor</option>
+                    <option value="rf">Random Forest Regressor</option>
+                    <option value="et">Extra Trees Regressor</option>
+                    <option value="ada">AdaBoost Regressor</option>
+                    <option value="gbr">Gradient Boosting Regressor</option>
+                    <option value="mlp">MLP Regressor</option>
+                    <option value="xgboost">Extreme Gradient Boosting</option>
+                    <option value="lightgbm">Light Gradient Boosting Machine</option>
+                    <option value="catboost">CatBoost Regressor</option>
+                </param>
+            </when>
+        </conditional>
+        <param name="random_seed" type="integer" value="42" label="Random Seed" help="Random seed for reproducibility." />
+        <conditional name="advanced_settings">
+            <param name="customize_defaults" type="select" label="Customize Default Settings?" help="Select yes if you want to customize the default settings of the experiment.">
+                <option value="false" selected="true">No</option>
+                <option value="true">Yes</option>
+            </param>
+            <when value="true">
+                <param name="train_size" type="float" value="0.7" min="0.1" max="0.9" label="Train Size" help="Proportion of the dataset to include in the train split." />
+                <param name="normalize" type="boolean" truevalue="True" falsevalue="False" checked="false" label="Normalize Data" help="Whether to normalize data before training." />
+                <param name="feature_selection" type="boolean" truevalue="True" falsevalue="False" checked="false" label="Feature Selection" help="Whether to perform feature selection." />
+                <conditional name="cross_validation">
+                    <param name="enable_cross_validation" type="select" label="Enable Cross Validation?" help="Select whether to enable cross-validation. Default: Yes" >
+                        <option value="false" >No</option>
+                        <option value="true" selected="true">Yes</option>
+                    </param>
+                    <when value="true">
+                        <param name="cross_validation_folds" type="integer" value="10" min="2" max="20" label="Cross Validation Folds" help="Number of folds to use for cross-validation. Default: 10" />
+                    </when>
+                    <when value="false">
+                        <!-- No additional parameters to show if the user selects 'No' -->
+                    </when>
+                </conditional>
+                <param name="remove_outliers" type="boolean" truevalue="True" falsevalue="False" checked="false" label="Remove Outliers" help="Whether to remove outliers from the dataset before training. Default: False" />
+                <param name="remove_multicollinearity" type="boolean" truevalue="True" falsevalue="False" checked="false" label="Remove Multicollinearity" help="Whether to remove multicollinear features before training. Default: False" />
+                <param name="polynomial_features" type="boolean" truevalue="True" falsevalue="False" checked="false" label="Polynomial Features" help="Whether to create polynomial features before training. Default: False" />
+                <param name="fix_imbalance" type="boolean" truevalue="True" falsevalue="False" checked="false" label="Fix Imbalance" help="ONLY for classfication! Whether to use SMOTE or similar methods to fix imbalance in the dataset. Default: False" />
+            </when>
+            <when value="false">
+                <!-- No additional parameters to show if the user selects 'No' -->
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <data name="model" format="h5" from_work_dir="pycaret_model.h5" label="${tool.name} best model on ${on_string}" />
+        <data name="comparison_result" format="html" from_work_dir="comparison_result.html" label="${tool.name} Comparison result on ${on_string}"/>
+        <data name="best_model_csv" format="csv" from_work_dir="best_model.csv" label="${tool.name} The prams of the best model on ${on_string}" hidden="true" />
+    </outputs>
+    <tests>
+        <test>
+            <param name="input_file" value="pcr.tsv"/>
+            <param name="target_feature" value="11"/> 
+            <param name="model_type" value="classification"/>
+            <param name="random_seed" value="42"/>
+            <param name="customize_defaults" value="true"/>
+            <param name="train_size" value="0.8"/>
+            <param name="normalize" value="true"/>
+            <param name="feature_selection" value="true"/>
+            <param name="enable_cross_validation" value="true"/>
+            <param name="cross_validation_folds" value="5"/>
+            <param name="remove_outliers" value="true"/>
+            <param name="remove_multicollinearity" value="true"/>
+            <output name="model" file="expected_model_classification_customized.h5" compare="sim_size"/>
+            <output name="comparison_result" file="expected_comparison_result_classification_customized.html" compare="sim_size" /> 
+            <output name="best_model_csv" value="expected_best_model_classification_customized.csv" />
+        </test>
+        <test>
+            <param name="input_file" value="pcr.tsv"/>
+            <param name="target_feature" value="11"/> 
+            <param name="model_type" value="classification"/>
+            <param name="random_seed" value="42"/>
+            <output name="model" file="expected_model_classification.h5" compare="sim_size"/>
+            <output name="comparison_result" file="expected_comparison_result_classification.html" compare="sim_size" /> 
+            <output name="best_model_csv" value="expected_best_model_classification.csv" />
+        </test>
+        <test>
+            <param name="input_file" value="auto-mpg.tsv"/>
+            <param name="target_feature" value="1"/> 
+            <param name="model_type" value="regression"/>
+            <param name="random_seed" value="42"/>
+            <output name="model" file="expected_model_regression.h5" compare="sim_size" />
+            <output name="comparison_result" file="expected_comparison_result_regression.html" compare="sim_size" /> 
+            <output name="best_model_csv" value="expected_best_model_regression.csv" />
+        </test>
+    </tests>
+    <help>
+        This tool uses PyCaret to train and evaluate machine learning models.
+        It compares different models on a dataset and provides the best model based on the performance metrics.
+
+        **Outputs**
+
+        - **Model**: The best model trained on the dataset in h5 format.
+
+
+        - **Comparison Result**: The comparison result of different models in html format. 
+            It contains the performance metrics of different models, plots of the best model 
+            on the testing set (or part of the training set if a separate test set is not uploaded), and feature analysis plots.
+
+    </help>
+    <expand macro="macro_citations" />
+</tool>
\ No newline at end of file