Mercurial > repos > goeckslab > pycaret_compare
diff pycaret_train.xml @ 0:915447b14520 draft
planemo upload for repository https://github.com/goeckslab/Galaxy-Pycaret commit d79b0f722b7d09505a526d1a4332f87e548a3df1
author | goeckslab |
---|---|
date | Wed, 11 Dec 2024 05:00:00 +0000 |
parents | |
children | f6def1b90150 |
line wrap: on
line diff
<!--
    Galaxy tool wrapper: PyCaret Model Comparison (pycaret_train.xml).

    Runs pycaret_train.py on a tabular training dataset, compares the selected
    (or all) PyCaret models for a classification or regression task, and
    collects the best model, an HTML comparison report and a CSV holding the
    parameters of the best model.

    The version and profile tokens, the "python_requirements" macro and the
    "macro_citations" macro are all supplied by pycaret_macros.xml.
-->
<tool id="pycaret_compare" name="PyCaret Model Comparison" version="@VERSION@" profile="@PROFILE@">
    <description>compares different machine learning models on a dataset using PyCaret. Do feature analyses using Random Forest and LightGBM. </description>
    <macros>
        <import>pycaret_macros.xml</import>
    </macros>
    <expand macro="python_requirements" />
    <command>
        <![CDATA[
        ## Every parameter declared inside a <conditional> is referenced through
        ## its fully qualified name (conditional_name.param_name); bare names of
        ## nested non-dataset parameters are not guaranteed to resolve in Cheetah.
        ## Substituted values are single-quoted so paths with spaces stay intact.
        python '$__tool_directory__/pycaret_train.py'
            --input_file '$input_file'
            --target_col '$target_feature'
            --output_dir "`pwd`"
            --random_seed '$random_seed'
        ## Restrict the comparison to the chosen models; when nothing is
        ## selected no --models option is passed at all.
        #if $model_selection.model_type == "classification"
            #if $model_selection.classification_models
                --models '$model_selection.classification_models'
            #end if
        #end if
        #if $model_selection.model_type == "regression"
            #if $model_selection.regression_models
                --models '$model_selection.regression_models'
            #end if
        #end if
        ## The advanced options only exist when the user chose to customize.
        #if $advanced_settings.customize_defaults == "true"
            #if $advanced_settings.train_size
                --train_size '$advanced_settings.train_size'
            #end if
            #if $advanced_settings.normalize
                --normalize
            #end if
            #if $advanced_settings.feature_selection
                --feature_selection
            #end if
            ## cross_validation_folds is only defined in the "true" branch of
            ## the nested conditional, so it must be read inside this check.
            #if $advanced_settings.cross_validation.enable_cross_validation == "true"
                --cross_validation
                #if $advanced_settings.cross_validation.cross_validation_folds
                    --cross_validation_folds '$advanced_settings.cross_validation.cross_validation_folds'
                #end if
            #end if
            #if $advanced_settings.remove_outliers
                --remove_outliers
            #end if
            #if $advanced_settings.remove_multicollinearity
                --remove_multicollinearity
            #end if
            #if $advanced_settings.polynomial_features
                --polynomial_features
            #end if
            #if $advanced_settings.fix_imbalance
                --fix_imbalance
            #end if
        #end if
        ## Optional held-out test set.
        #if $test_file
            --test_file '$test_file'
        #end if
            --model_type '$model_selection.model_type'
        ]]>
    </command>
    <inputs>
        <param name="input_file" type="data" format="csv,tabular" label="Train Dataset (CSV or TSV)" />
        <param name="test_file" type="data" format="csv,tabular" optional="true" label="Test Dataset (CSV or TSV)"
               help="If a test set is not provided, the selected training set will be split into training, validation, and test sets. If a test set is provided, the training set will only be split into training and validation sets. BTW, cross-validation is always applied by default." />
        <param name="target_feature" multiple="false" type="data_column" use_header_names="true" data_ref="input_file" label="Select the target column:" />
        <!-- Task type; each case exposes its own list of candidate models. -->
        <conditional name="model_selection">
            <param name="model_type" type="select" label="Task">
                <option value="classification">classification</option>
                <option value="regression">regression</option>
            </param>
            <when value="classification">
                <!-- Leaving the selection empty means "compare all models". -->
                <param name="classification_models" type="select" multiple="true" optional="true" label="Only Select Classification Models if you don't want to compare all models">
                    <option value="lr">Logistic Regression</option>
                    <option value="knn">K Neighbors Classifier</option>
                    <option value="nb">Naive Bayes</option>
                    <option value="dt">Decision Tree Classifier</option>
                    <option value="svm">SVM - Linear Kernel</option>
                    <option value="rbfsvm">SVM - Radial Kernel</option>
                    <option value="gpc">Gaussian Process Classifier</option>
                    <option value="mlp">MLP Classifier</option>
                    <option value="ridge">Ridge Classifier</option>
                    <option value="rf">Random Forest Classifier</option>
                    <option value="qda">Quadratic Discriminant Analysis</option>
                    <option value="ada">Ada Boost Classifier</option>
                    <option value="gbc">Gradient Boosting Classifier</option>
                    <option value="lda">Linear Discriminant Analysis</option>
                    <option value="et">Extra Trees Classifier</option>
                    <option value="xgboost">Extreme Gradient Boosting</option>
                    <option value="lightgbm">Light Gradient Boosting Machine</option>
                    <option value="catboost">CatBoost Classifier</option>
                </param>
            </when>
            <when value="regression">
                <!-- Leaving the selection empty means "compare all models". -->
                <param name="regression_models" type="select" multiple="true" optional="true" label="Only Select Regression Models if you don't want to compare all models">
                    <option value="lr">Linear Regression</option>
                    <option value="lasso">Lasso Regression</option>
                    <option value="ridge">Ridge Regression</option>
                    <option value="en">Elastic Net</option>
                    <option value="lar">Least Angle Regression</option>
                    <option value="llar">Lasso Least Angle Regression</option>
                    <option value="omp">Orthogonal Matching Pursuit</option>
                    <option value="br">Bayesian Ridge</option>
                    <option value="ard">Automatic Relevance Determination</option>
                    <option value="par">Passive Aggressive Regressor</option>
                    <option value="ransac">Random Sample Consensus</option>
                    <option value="tr">TheilSen Regressor</option>
                    <option value="huber">Huber Regressor</option>
                    <option value="kr">Kernel Ridge</option>
                    <option value="svm">Support Vector Regression</option>
                    <option value="knn">K Neighbors Regressor</option>
                    <option value="dt">Decision Tree Regressor</option>
                    <option value="rf">Random Forest Regressor</option>
                    <option value="et">Extra Trees Regressor</option>
                    <option value="ada">AdaBoost Regressor</option>
                    <option value="gbr">Gradient Boosting Regressor</option>
                    <option value="mlp">MLP Regressor</option>
                    <option value="xgboost">Extreme Gradient Boosting</option>
                    <option value="lightgbm">Light Gradient Boosting Machine</option>
                    <option value="catboost">CatBoost Regressor</option>
                </param>
            </when>
        </conditional>
        <param name="random_seed" type="integer" value="42" label="Random Seed" help="Random seed for reproducibility." />
        <!-- Optional experiment settings; hidden unless the user opts in. -->
        <conditional name="advanced_settings">
            <param name="customize_defaults" type="select" label="Customize Default Settings?" help="Select yes if you want to customize the default settings of the experiment.">
                <option value="false" selected="true">No</option>
                <option value="true">Yes</option>
            </param>
            <when value="true">
                <param name="train_size" type="float" value="0.7" min="0.1" max="0.9" label="Train Size" help="Proportion of the dataset to include in the train split." />
                <param name="normalize" type="boolean" truevalue="True" falsevalue="False" checked="false" label="Normalize Data" help="Whether to normalize data before training." />
                <param name="feature_selection" type="boolean" truevalue="True" falsevalue="False" checked="false" label="Feature Selection" help="Whether to perform feature selection." />
                <conditional name="cross_validation">
                    <param name="enable_cross_validation" type="select" label="Enable Cross Validation?" help="Select whether to enable cross-validation. Default: Yes">
                        <option value="false">No</option>
                        <option value="true" selected="true">Yes</option>
                    </param>
                    <when value="true">
                        <param name="cross_validation_folds" type="integer" value="10" min="2" max="20" label="Cross Validation Folds" help="Number of folds to use for cross-validation. Default: 10" />
                    </when>
                    <when value="false">
                        <!-- No additional parameters to show if the user selects 'No' -->
                    </when>
                </conditional>
                <param name="remove_outliers" type="boolean" truevalue="True" falsevalue="False" checked="false" label="Remove Outliers" help="Whether to remove outliers from the dataset before training. Default: False" />
                <param name="remove_multicollinearity" type="boolean" truevalue="True" falsevalue="False" checked="false" label="Remove Multicollinearity" help="Whether to remove multicollinear features before training. Default: False" />
                <param name="polynomial_features" type="boolean" truevalue="True" falsevalue="False" checked="false" label="Polynomial Features" help="Whether to create polynomial features before training. Default: False" />
                <param name="fix_imbalance" type="boolean" truevalue="True" falsevalue="False" checked="false" label="Fix Imbalance" help="ONLY for classification! Whether to use SMOTE or similar methods to fix imbalance in the dataset. Default: False" />
            </when>
            <when value="false">
                <!-- No additional parameters to show if the user selects 'No' -->
            </when>
        </conditional>
    </inputs>
    <outputs>
        <!-- All outputs are picked up from files written to the job working directory. -->
        <data name="model" format="h5" from_work_dir="pycaret_model.h5" label="${tool.name} best model on ${on_string}" />
        <data name="comparison_result" format="html" from_work_dir="comparison_result.html" label="${tool.name} Comparison result on ${on_string}" />
        <data name="best_model_csv" format="csv" from_work_dir="best_model.csv" label="${tool.name} The params of the best model on ${on_string}" hidden="true" />
    </outputs>
    <tests>
        <!-- Classification with customized advanced settings. -->
        <test>
            <param name="input_file" value="pcr.tsv" />
            <param name="target_feature" value="11" />
            <param name="model_type" value="classification" />
            <param name="random_seed" value="42" />
            <param name="customize_defaults" value="true" />
            <param name="train_size" value="0.8" />
            <param name="normalize" value="true" />
            <param name="feature_selection" value="true" />
            <param name="enable_cross_validation" value="true" />
            <param name="cross_validation_folds" value="5" />
            <param name="remove_outliers" value="true" />
            <param name="remove_multicollinearity" value="true" />
            <output name="model" file="expected_model_classification_customized.h5" compare="sim_size" />
            <output name="comparison_result" file="expected_comparison_result_classification_customized.html" compare="sim_size" />
            <output name="best_model_csv" file="expected_best_model_classification_customized.csv" />
        </test>
        <!-- Classification with default settings. -->
        <test>
            <param name="input_file" value="pcr.tsv" />
            <param name="target_feature" value="11" />
            <param name="model_type" value="classification" />
            <param name="random_seed" value="42" />
            <output name="model" file="expected_model_classification.h5" compare="sim_size" />
            <output name="comparison_result" file="expected_comparison_result_classification.html" compare="sim_size" />
            <output name="best_model_csv" file="expected_best_model_classification.csv" />
        </test>
        <!-- Regression with default settings. -->
        <test>
            <param name="input_file" value="auto-mpg.tsv" />
            <param name="target_feature" value="1" />
            <param name="model_type" value="regression" />
            <param name="random_seed" value="42" />
            <output name="model" file="expected_model_regression.h5" compare="sim_size" />
            <output name="comparison_result" file="expected_comparison_result_regression.html" compare="sim_size" />
            <output name="best_model_csv" file="expected_best_model_regression.csv" />
        </test>
    </tests>
    <help><![CDATA[
This tool uses PyCaret to train and evaluate machine learning models.
It compares different models on a dataset and provides the best model based on the performance metrics.

**Outputs**

- **Model**: The best model trained on the dataset in h5 format.

- **Comparison Result**: The comparison result of different models in html format.
  It contains the performance metrics of different models, plots of the best model
  on the testing set (or part of the training set if a separate test set is not uploaded), and feature analysis plots.
    ]]></help>
    <expand macro="macro_citations" />
</tool>