Mercurial > repos > goeckslab > multimodal_learner
diff multimodal_learner.xml @ 0:375c36923da1 draft default tip
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
| author | goeckslab |
|---|---|
| date | Tue, 09 Dec 2025 23:49:47 +0000 |
| parents | |
| children | |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/multimodal_learner.xml Tue Dec 09 23:49:47 2025 +0000
@@ -0,0 +1,325 @@
+<tool id="multimodal_learner" name="Multimodal Learner" version="0.1.0" profile="22.01">
+    <description>Train and evaluate an AutoGluon Multimodal model (tabular + image + text)</description>
+
+    <!-- Galaxy wrapper around multimodal_learner.py: the command symlinks the selected CSV datasets
+         into the job directory, builds the script's command line from the parameters below and
+         writes three outputs (HTML report, YAML training config, JSON metrics). -->
+    <requirements>
+        <container type="docker">quay.io/goeckslab/multimodal-learner:1.4.0</container>
+    </requirements>
+
+    <required_files>
+        <include path="multimodal_learner.py"/>
+        <include path="utils.py"/>
+        <include path="split_logic.py"/>
+        <include path="training_pipeline.py"/>
+        <include path="test_pipeline.py"/>
+        <include path="metrics_logic.py"/>
+        <include path="plot_logic.py"/>
+        <include path="report_utils.py"/>
+        <include path="feature_help_modal.py"/>
+    </required_files>
+
+    <stdio>
+        <exit_code range="137" level="fatal_oom" description="Out of Memory"/>
+        <exit_code range="1:" level="fatal" description="Tool failed — see Tool Standard Error"/>
+    </stdio>
+
+    <command detect_errors="exit_code"><![CDATA[
+#import re
+
+## Gather every uploaded image archive into one shell-quoted, space-separated argument list.
+#set $image_zip_paths = []
+#if $use_images_conditional.use_images == "yes"
+    #for $zip_file in $use_images_conditional.images_zip_repeat
+        #set $image_zip_paths = $image_zip_paths + [$zip_file.images_zip]
+    #end for
+#end if
+#if len($image_zip_paths) > 0
+    #set $images_zip_cli = " ".join(["'%s'" % z for z in $image_zip_paths])
+#else
+    #set $images_zip_cli = None
+#end if
+
+## The test dataset input is optional: use it only when the toggle is on AND a dataset was actually selected.
+#set $use_test_csv = bool($test_dataset_conditional.has_test_dataset == "yes" and $test_dataset_conditional.input_test)
+
+set -e;
+ln -sf '$input_csv' 'train_input.csv';
+#if $use_test_csv
+ln -sf '$test_dataset_conditional.input_test' 'test_input.csv';
+#end if
+
+python '$__tool_directory__/multimodal_learner.py'
+    --input_csv_train 'train_input.csv'
+    #if $use_test_csv
+        --input_csv_test 'test_input.csv'
+    #end if
+    --target_column '$target_column'
+
+    #if $use_images_conditional.use_images == "yes"
+        #if $images_zip_cli
+            --images_zip $images_zip_cli
+        #end if
+        --missing_image_strategy '$use_images_conditional.missing_image_strategy'
+        #if $use_images_conditional.backbone_image
+            --backbone_image '$use_images_conditional.backbone_image'
+        #end if
+    #end if
+
+    #if $backbone_text not in ("", None)
+        --backbone_text '$backbone_text'
+    #end if
+
+    --preset '$preset'
+    --eval_metric '$eval_metric'
+
+    --random_seed '$random_seed'
+    #if $time_limit
+        --time_limit $time_limit
+    #end if
+    #if $deterministic == "true"
+        --deterministic
+    #end if
+
+    #if $customize_defaults_conditional.customize_defaults == "yes"
+        #if $customize_defaults_conditional.validation_size not in ("", None)
+            --validation_size $customize_defaults_conditional.validation_size
+        #end if
+        ## Accept comma- or whitespace-separated fractions and emit them as space-separated floats.
+        #if $customize_defaults_conditional.split_probabilities and str($customize_defaults_conditional.split_probabilities).strip()
+            --split_probabilities #echo " ".join([str(float(x)) for x in str($customize_defaults_conditional.split_probabilities).replace(",", " ").split() if x.strip()]) #
+        #end if
+        #if $customize_defaults_conditional.cross_validation == "true"
+            --cross_validation true
+            --num_folds $customize_defaults_conditional.num_folds
+        #end if
+        #if $customize_defaults_conditional.epochs
+            --epochs $customize_defaults_conditional.epochs
+        #end if
+        #if $customize_defaults_conditional.learning_rate
+            --learning_rate $customize_defaults_conditional.learning_rate
+        #end if
+        #if $customize_defaults_conditional.batch_size
+            --batch_size $customize_defaults_conditional.batch_size
+        #end if
+        ## 0 is a valid threshold (min="0") but is falsy, so test for "a value was entered" instead of truthiness.
+        #if str($customize_defaults_conditional.threshold) != ""
+            --threshold $customize_defaults_conditional.threshold
+        #end if
+        #if $customize_defaults_conditional.hyperparameters
+            --hyperparameters '$customize_defaults_conditional.hyperparameters'
+        #end if
+    #end if
+
+    --output_json '$output_json'
+    --output_html '$output_html'
+    --output_config '$output_config'
+]]></command>
+
+    <inputs>
+        <param name="input_csv" type="data" format="csv,tsv" label="Training dataset (CSV/TSV)" help="Must contain the target column and optional image paths"/>
+        <param name="target_column" type="data_column" data_ref="input_csv" numerical="false" use_header_names="true" label="Target / Label column"/>
+
+        <conditional name="test_dataset_conditional">
+            <param name="has_test_dataset" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="Provide separate test dataset?"/>
+            <when value="yes">
+                <param name="input_test" type="data" format="csv,tsv" optional="true" label="Test dataset (CSV/TSV)"/>
+            </when>
+            <when value="no"/>
+        </conditional>
+
+        <param name="backbone_text" type="select" label="Text backbone" optional="true">
+            <option value="microsoft/deberta-v3-base" selected="true">microsoft/deberta-v3-base</option>
+            <option value="microsoft/deberta-v3-small">microsoft/deberta-v3-small</option>
+            <option value="google/electra-base-discriminator">google/electra-base-discriminator</option>
+            <option value="google/electra-small-discriminator">google/electra-small-discriminator</option>
+            <option value="roberta-base">roberta-base</option>
+            <option value="bert-base-uncased">bert-base-uncased</option>
+            <option value="distilroberta-base">distilroberta-base</option>
+            <option value="albert-base-v2">albert-base-v2</option>
+        </param>
+
+        <conditional name="use_images_conditional">
+            <param name="use_images" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="Use image modality?"/>
+            <when value="yes">
+                <repeat name="images_zip_repeat" title="Image archive(s)" min="1">
+                    <param name="images_zip" type="data" format="zip" label="ZIP file containing images"/>
+                </repeat>
+                <param name="backbone_image" type="select" label="Image backbone" optional="true">
+                    <option value="swin_base_patch4_window7_224" selected="true">swin_base_patch4_window7_224</option>
+                    <option value="swin_large_patch4_window12_384.in22k_ft_in1k">swin_large_patch4_window12_384.in22k_ft_in1k</option>
+                    <option value="swin_small_patch4_window7_224">swin_small_patch4_window7_224</option>
+                    <option value="swin_tiny_patch4_window7_224">swin_tiny_patch4_window7_224</option>
+                    <option value="caformer_b36.in21k_ft_in1k">caformer_b36.in21k_ft_in1k</option>
+                    <option value="caformer_m36.in21k_ft_in1k">caformer_m36.in21k_ft_in1k</option>
+                    <option value="caformer_s36.in21k_ft_in1k">caformer_s36.in21k_ft_in1k</option>
+                    <option value="caformer_s18.in1k">caformer_s18.in1k</option>
+                    <option value="caformer_b36.sail_in22k_ft_in1k">caformer_b36.sail_in22k_ft_in1k</option>
+                    <option value="caformer_m36.sail_in22k_ft_in1k">caformer_m36.sail_in22k_ft_in1k</option>
+                    <option value="caformer_s36.sail_in22k_ft_in1k">caformer_s36.sail_in22k_ft_in1k</option>
+                    <option value="vit_base_patch16_224">vit_base_patch16_224</option>
+                    <option value="vit_large_patch14_224">vit_large_patch14_224</option>
+                    <option value="convnext_base">convnext_base</option>
+                    <option value="eva02_base_patch14_448.mim_in22k_ft_in22k_in1k">eva02_base_patch14_448.mim_in22k_ft_in22k_in1k</option>
+                    <option value="resnet50">resnet50</option>
+                </param>
+                <param name="missing_image_strategy" type="boolean" truevalue="true" falsevalue="false" checked="false"
+                       label="Drop rows with missing images?" help="True = drop, False = replace with placeholder (default)"/>
+            </when>
+            <when value="no"/>
+        </conditional>
+
+        <param name="preset" type="select" label="Quality preset">
+            <option value="medium_quality" selected="true">Medium quality (fast)</option>
+            <option value="high_quality">High quality</option>
+            <option value="best_quality">Best quality (slowest)</option>
+        </param>
+
+        <param name="eval_metric" type="select" label="Primary evaluation metric">
+            <option value="auto" selected="true">Auto (let AutoGluon choose)</option>
+            <option value="roc_auc">ROC AUC</option>
+            <option value="accuracy">Accuracy</option>
+            <option value="balanced_accuracy">Balanced Accuracy</option>
+            <option value="f1">F1</option>
+            <option value="f1_macro">F1 Macro</option>
+            <option value="f1_micro">F1 Micro</option>
+            <option value="f1_weighted">F1 Weighted</option>
+            <option value="precision">Precision</option>
+            <option value="precision_macro">Precision Macro</option>
+            <option value="precision_micro">Precision Micro</option>
+            <option value="precision_weighted">Precision Weighted</option>
+            <option value="recall">Recall</option>
+            <option value="recall_macro">Recall Macro</option>
+            <option value="recall_micro">Recall Micro</option>
+            <option value="recall_weighted">Recall Weighted</option>
+            <option value="average_precision">Average Precision</option>
+            <option value="roc_auc_ovo_macro">ROC AUC OVO Macro</option>
+            <option value="roc_auc_ovo_weighted">ROC AUC OVO Weighted</option>
+            <option value="roc_auc_ovr_macro">ROC AUC OVR Macro</option>
+            <option value="roc_auc_ovr_weighted">ROC AUC OVR Weighted</option>
+            <option value="log_loss">Log Loss</option>
+            <option value="mse">MSE</option>
+            <option value="rmse">RMSE</option>
+            <option value="mae">MAE</option>
+            <option value="msle">MSLE</option>
+            <option value="r2">R2</option>
+        </param>
+
+        <param name="random_seed" type="integer" value="42" label="Random seed"/>
+
+        <param name="time_limit" type="integer" optional="true" min="60" label="Time limit (seconds)" help="Total training time budget. Recommended: 3600+ for real runs"/>
+        <param name="deterministic" type="boolean" truevalue="true" falsevalue="false" checked="false"
+               label="Enable deterministic mode" help="Use deterministic algorithms and CuDNN settings to reduce run-to-run variance (may slow training)"/>
+
+        <conditional name="customize_defaults_conditional">
+            <param name="customize_defaults" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="Advanced: customize training settings?"/>
+            <when value="yes">
+                <param name="validation_size" type="float" value="0.2" label="Validation fraction (when test set provided)"/>
+                <param name="split_probabilities" type="text" value="0.7 0.1 0.2" label="Train / Val / Test split (space-separated) when no test set"/>
+                <param name="cross_validation" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Enable k-fold cross-validation"/>
+                <param name="num_folds" type="integer" value="5" label="Number of CV folds"/>
+                <param name="epochs" type="integer" optional="true" label="Max epochs"/>
+                <param name="learning_rate" type="float" optional="true" label="Learning rate"/>
+                <param name="batch_size" type="integer" optional="true" label="Batch size"/>
+                <param name="threshold" type="float" optional="true" min="0" max="1" label="Binary classification threshold"/>
+                <param name="hyperparameters" type="text" optional="true" label="Extra AutoGluon hyperparameters (JSON or YAML string)"/>
+            </when>
+            <when value="no"/>
+        </conditional>
+    </inputs>
+
+    <outputs>
+        <data name="output_html" format="html" label="Multimodal Learner analysis report on data ${input_csv.name}"/>
+        <data name="output_config" format="yaml" label="Multimodal Learner training config on data ${input_csv.name}"/>
+        <data name="output_json" format="json" label="Multimodal Learner metric results on data ${input_csv.name}"/>
+    </outputs>
+
+    <tests>
+        <!-- Basic run with images + external test set -->
+        <test expect_num_outputs="3">
+            <param name="input_csv" value="train.csv"/>
+            <param name="target_column" value="7"/>
+            <param name="test_dataset_conditional|has_test_dataset" value="yes"/>
+            <param name="test_dataset_conditional|input_test" value="test.csv"/>
+            <param name="use_images_conditional|use_images" value="yes"/>
+            <param name="use_images_conditional|images_zip_repeat_0|images_zip" value="images.zip"/>
+            <param name="use_images_conditional|backbone_image" value="resnet50"/>
+            <param name="backbone_text" value="google/electra-base-discriminator"/>
+            <output name="output_html">
+                <assert_contents>
+                    <has_text text="Model Performance Summary"/>
+                </assert_contents>
+            </output>
+        </test>
+
+        <!-- Custom threshold -->
+        <test expect_num_outputs="3">
+            <param name="input_csv" value="train.csv"/>
+            <param name="target_column" value="7"/>
+            <param name="test_dataset_conditional|has_test_dataset" value="yes"/>
+            <param name="test_dataset_conditional|input_test" value="test.csv"/>
+            <param name="use_images_conditional|use_images" value="yes"/>
+            <param name="use_images_conditional|images_zip_repeat_0|images_zip" value="images.zip"/>
+            <param name="customize_defaults_conditional|customize_defaults" value="yes"/>
+            <param name="customize_defaults_conditional|threshold" value="0.4"/>
+            <output name="output_json">
+                <assert_contents>
+                    <has_text text="&quot;threshold&quot;: 0.4"/>
+                </assert_contents>
+            </output>
+        </test>
+
+        <!-- No external test set; internal split -->
+        <test expect_num_outputs="3">
+            <param name="input_csv" value="train.csv"/>
+            <param name="target_column" value="7"/>
+            <param name="test_dataset_conditional|has_test_dataset" value="no"/>
+            <param name="use_images_conditional|use_images" value="yes"/>
+            <param name="use_images_conditional|images_zip_repeat_0|images_zip" value="images.zip"/>
+            <output name="output_json">
+                <assert_contents>
+                    <has_text text="&quot;val&quot;"/>
+                </assert_contents>
+            </output>
+        </test>
+
+        <!-- Text/tabular only (ignore images) -->
+        <test expect_num_outputs="3">
+            <param name="input_csv" value="train.csv"/>
+            <param name="target_column" value="7"/>
+            <param name="test_dataset_conditional|has_test_dataset" value="yes"/>
+            <param name="test_dataset_conditional|input_test" value="test.csv"/>
+            <param name="use_images_conditional|use_images" value="no"/>
+            <output name="output_html">
+                <assert_contents>
+                    <has_text text="Train and Validation Performance Summary"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+
+    <help><![CDATA[
+**AutoGluon Multimodal Learner**
+
+Trains a powerful multimodal model combining tabular features, images, and text using AutoGluon-Multimodal.
+
+- Handles missing images intelligently
+- Supports cross-validation
+- Produces detailed HTML reports and transparent metrics
+- Fully reproducible
+
+Ideal for medical imaging + clinical data, product images + descriptions, etc.
+]]></help>
+
+    <citations>
+        <citation type="bibtex">
+@article{Erickson2020AutoGluonTabular,
+  author = {Erickson, Nick and Mueller, Jonas and Shirkov, Alexander and Zhang, Hang and Larroy, Pedro and Li, Mu and Smola, Alexander},
+  title = {AutoGluon-Tabular: Robust and Accurate AutoML for Structured Data},
+  journal = {arXiv preprint arXiv:2003.06505},
+  year = {2020}
+}
+        </citation>
+    </citations>
+</tool>
