diff keras_train_and_eval.xml @ 10:b3093f953091 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 9981e25b00de29ed881b2229a173a8c812ded9bb
author bgruening
date Wed, 09 Aug 2023 13:30:51 +0000
parents 3866911c93ae
children 818f9b69d8a0
line wrap: on
line diff
--- a/keras_train_and_eval.xml	Thu Aug 11 09:15:54 2022 +0000
+++ b/keras_train_and_eval.xml	Wed Aug 09 13:30:51 2023 +0000
@@ -1,4 +1,4 @@
-<tool id="keras_train_and_eval" name="Deep learning training and evaluation" version="@VERSION@" profile="20.05">
+<tool id="keras_train_and_eval" name="Deep learning training and evaluation" version="@VERSION@" profile="@PROFILE@">
     <description>conduct deep training and evaluation either implicitly or explicitly</description>
     <macros>
         <import>main_macros.xml</import>
@@ -19,27 +19,27 @@
             --inputs '$inputs'
             --estimator '$experiment_schemes.infile_estimator'
             #if $input_options.selected_input == 'seq_fasta'
-                --fasta_path '$input_options.fasta_path'
+            --fasta_path '$input_options.fasta_path'
             #elif $input_options.selected_input == 'refseq_and_interval'
-                --ref_seq "`pwd`/${ref_genome_file.element_identifier}"
-                --interval '$input_options.interval_file'
-                --targets "`pwd`/${target_file.element_identifier}.gz"
+            --ref_seq "`pwd`/${ref_genome_file.element_identifier}"
+            --interval '$input_options.interval_file'
+            --targets "`pwd`/${target_file.element_identifier}.gz"
             #else
-                --infile1 '$input_options.infile1'
+            --infile1 '$input_options.infile1'
             #end if
-                --infile2 '$input_options.infile2'
-                --outfile_result '$outfile_result'
+            --infile2 '$input_options.infile2'
+            --outfile_result '$outfile_result'
             #if $save and 'save_estimator' in str($save)
-                --outfile_object '$outfile_object'
-                --outfile_weights '$outfile_weights'
+            --outfile_object '$outfile_object'
             #end if
             #if $save and 'save_prediction' in str($save)
-                --outfile_y_true '$outfile_y_true'
-                --outfile_y_preds '$outfile_y_preds'
+            --outfile_y_true '$outfile_y_true'
+            --outfile_y_preds '$outfile_y_preds'
             #end if
             #if $experiment_schemes.test_split.split_algos.shuffle == 'group'
-                --groups '$experiment_schemes.test_split.split_algos.groups_selector.infile_g'
+            --groups '$experiment_schemes.test_split.split_algos.groups_selector.infile_g'
             #end if
+
         ]]>
     </command>
     <configfiles>
@@ -59,7 +59,7 @@
                     </expand>
                 </section>
                 <section name="metrics" title="Metrics for evaluation" expanded="false">
-                    <expand macro="scoring_selection" />
+                    <expand macro="scoring_selection" help="" />
                 </section>
             </when>
             <when value="train_val_test">
@@ -72,23 +72,20 @@
                 <section name="val_split" title="Validation holdout (recommend using the same splitting method as for test holdout)" expanded="false">
                     <expand macro="train_test_split_params" />
                 </section>
-                <section name="metrics" title="Metrics for evaluation" expanded="false">
-                    <expand macro="scoring_selection" />
+                <section name="metrics" title="Metrics from scikit-learn" expanded="false">
+                    <expand macro="scoring_selection" help="" />
                 </section>
             </when>
         </conditional>
         <expand macro="sl_mixed_input_plus_sequence" />
         <param name="save" type="select" multiple='true' display="checkboxes" label="Save the fitted model" optional="true" help="Evaluation scores will be output by default.">
-            <option value="save_estimator" selected="true">Fitted estimator in skeleton and weights, separately</option>
+            <option value="save_estimator" selected="true">Fitted estimator</option>
             <option value="save_prediction">True labels and prediction results from evaluation for downstream analysis</option>
         </param>
     </inputs>
     <outputs>
         <data format="tabular" name="outfile_result" />
-        <data format="zip" name="outfile_object" label="Fitted estimator or estimator skeleton on ${on_string}">
-            <filter>str(save) and 'save_estimator' in str(save)</filter>
-        </data>
-        <data format="h5" name="outfile_weights" label="Weights trained on ${on_string}">
+        <data format="h5mlm" name="outfile_object" label="Fitted estimator or estimator skeleton on ${on_string}">
             <filter>str(save) and 'save_estimator' in str(save)</filter>
         </data>
         <data format="tabular" name="outfile_y_true" label="True labels/target values on ${on_string}">
@@ -102,20 +99,19 @@
         <test>
             <conditional name="experiment_schemes">
                 <param name="selected_exp_scheme" value="train_val_test" />
-                <param name="infile_estimator" value="keras_model04" ftype="zip" />
+                <param name="infile_estimator" value="keras_model04" ftype="h5mlm" />
                 <section name="hyperparams_swapping">
-                    <param name="infile_params" value="keras_params04.tabular" ftype="tabular" />
                     <repeat name="param_set">
                         <param name="sp_value" value="999" />
-                        <param name="sp_name" value="layers_0_Dense__config__kernel_initializer__config__seed" />
+                        <param name="sp_name" value="layers_1_Dense__config__kernel_initializer__config__seed" />
                     </repeat>
                     <repeat name="param_set">
                         <param name="sp_value" value="999" />
-                        <param name="sp_name" value="layers_2_Dense__config__kernel_initializer__config__seed" />
+                        <param name="sp_name" value="layers_3_Dense__config__kernel_initializer__config__seed" />
                     </repeat>
                     <repeat name="param_set">
                         <param name="sp_value" value="0.1" />
-                        <param name="sp_name" value="lr" />
+                        <param name="sp_name" value="learning_rate" />
                     </repeat>
                     <repeat name="param_set">
                         <param name="sp_value" value="'adamax'" />
@@ -152,31 +148,29 @@
             <param name="save" value="save_estimator" />
             <output name="outfile_result">
                 <assert_contents>
-                    <has_n_columns n="2" />
-                    <has_text text="0.638" />
-                    <has_text text="-6.072" />
+                    <has_n_columns n="4" />
+                    <has_text text="0.794" />
+                    <has_text text="-4.62" />
                 </assert_contents>
             </output>
-            <output name="outfile_object" file="train_test_eval_model01" compare="sim_size" delta="50" />
-            <output name="outfile_weights" file="train_test_eval_weights01.h5" compare="sim_size" delta="50" />
+            <output name="outfile_object" file="train_test_eval_model01" compare="sim_size" delta="5" />
         </test>
         <test>
             <conditional name="experiment_schemes">
                 <param name="selected_exp_scheme" value="train_val_test" />
-                <param name="infile_estimator" value="keras_model04" ftype="zip" />
+                <param name="infile_estimator" value="keras_model04" ftype="h5mlm" />
                 <section name="hyperparams_swapping">
-                    <param name="infile_params" value="keras_params04.tabular" ftype="tabular" />
                     <repeat name="param_set">
                         <param name="sp_value" value="999" />
-                        <param name="sp_name" value="layers_0_Dense__config__kernel_initializer__config__seed" />
+                        <param name="sp_name" value="layers_1_Dense__config__kernel_initializer__config__seed" />
                     </repeat>
                     <repeat name="param_set">
                         <param name="sp_value" value="999" />
-                        <param name="sp_name" value="layers_2_Dense__config__kernel_initializer__config__seed" />
+                        <param name="sp_name" value="layers_3_Dense__config__kernel_initializer__config__seed" />
                     </repeat>
                     <repeat name="param_set">
                         <param name="sp_value" value="0.1" />
-                        <param name="sp_name" value="lr" />
+                        <param name="sp_name" value="learning_rate" />
                     </repeat>
                     <repeat name="param_set">
                         <param name="sp_value" value="'adamax'" />
@@ -217,22 +211,21 @@
             <param name="header2" value="true" />
             <param name="selected_column_selector_option2" value="all_columns" />
             <param name="save" value="save_estimator,save_prediction" />
-            <output name="outfile_result">
+            <output name="outfile_result" >
                 <assert_contents>
-                    <has_n_columns n="2" />
-                    <has_text text="0.627" />
-                    <has_text text="-6.012" />
+                    <has_n_columns n="4" />
+                    <has_text text="0.779" />
+                    <has_text text="-4.5" />
                 </assert_contents>
             </output>
-            <output name="outfile_weights" file="train_test_eval_weights02.h5" compare="sim_size" delta="50" />
+            <output name="outfile_object" file="train_test_eval_model02" compare="sim_size" delta="5" />
             <output name="outfile_y_true" file="keras_train_eval_y_true02.tabular" ftype="tabular" />
         </test>
         <test>
             <conditional name="experiment_schemes">
                 <param name="selected_exp_scheme" value="train_val" />
-                <param name="infile_estimator" value="pipeline10" ftype="zip" />
+                <param name="infile_estimator" value="pipeline10" ftype="h5mlm" />
                 <section name="hyperparams_swapping">
-                    <param name="infile_params" value="get_params10.tabular" ftype="tabular" />
                     <repeat name="param_set">
                         <param name="sp_value" value="10" />
                         <param name="sp_name" value="adaboostregressor__random_state" />
@@ -279,26 +272,28 @@
 
 Given a pre-built keras deep learning model and labeled training dataset, this tool works in two modes.
 
-- Train and Validate: training dataset is split into train and validation portions. The model fits on the train portion, in the meantime performances are validated on the validation portion multiple times along with the training progressing. Finally, a fitted model (skeleton + weights) and its validation performance scores are outputted. 
+- Train and Validate: the input dataset is split into training and validation portions. The model is fitted on the training portion; in the meantime, its performance is evaluated on the validation portion multiple times while the training is progressing. Finally, a fitted model and its validation performance scores are outputted.
 
 
-- Train, Validate and and Evaluate: training dataset is split into three portions, train, val and test. The same `Train and Validate` happens on the train and val portions. The test portion is hold out exclusively for testing (evaluation). As a result, a fitted model (skeleton + weights) and test performance scores are outputted.
+- Train, Validate and Evaluate: the input dataset is split into three portions: training, validation and testing. The same `Train and Validate` procedure described above is performed on the training and validation portions. The testing portion is used exclusively for testing (evaluation). As a result, a fitted model and test performance scores are outputted.
+
+In both modes, besides the performance scores, the true labels and predicted values can also be outputted; these can be used to generate plots in other tools, for example the machine learning visualization extensions.
 
-In both modes, besides the performance scores, the true labels and predicted values are able to be ouputted, which could be used in generating plots in other tools, machine learning visualization extensions, for example.
+Note that since all training and model parameters are accessible and changeable in the `Hyperparameter Swapping` section, the training and evaluation processes are flexible and transparent.
 
-Note that since all training and model parameters are accessible and changeable in the `Hyperparameter Swapping` section, the training and evaluation processes are transparent and fully controllable.
+For metrics, there are two sets of metrics for deep learning training and evaluation, one from the keras model builder and the other from scikit-learn. Keras metrics, if selected, are always evaluated, while the sklearn metrics can be skipped by selecting `default`. Please be aware that not every sklearn metric works with deep learning models at the moment. Feel free to file a ticket if you find an issue; contributions via PRs are always welcome.
 
 **Input**
 
 - tabular
 - sparse
-- `sequnences in a fasta file` to work with DNA, RNA and Proteins with corresponding fasta data generator
+- `sequences in a fasta file` to work with DNA, RNA and proteins with corresponding fasta data generator
 - `reference genome and intervals` exclusively work with `GenomicIntervalBatchGenerator`.
 
 **Output**
 
 - performance scores from evaluation
-- fitted estimator skeleton and weights
+- fitted estimator
 - true labels or values and predicted values from the evaluation
 
         ]]>