diff pipeline.xml @ 25:118e230e85ce draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 9981e25b00de29ed881b2229a173a8c812ded9bb
author bgruening
date Wed, 09 Aug 2023 13:40:06 +0000
parents 4de3d598c116
children
line wrap: on
line diff
--- a/pipeline.xml	Thu Aug 11 09:20:25 2022 +0000
+++ b/pipeline.xml	Wed Aug 09 13:40:06 2023 +0000
@@ -1,4 +1,4 @@
-<tool id="sklearn_build_pipeline" name="Pipeline Builder" version="@VERSION@" profile="20.05">
+<tool id="sklearn_build_pipeline" name="Pipeline Builder" version="@VERSION@" profile="@PROFILE@">
     <description>an all-in-one platform to build pipeline, single estimator, preprocessor and custom wrappers</description>
     <macros>
         <import>main_macros.xml</import>
@@ -18,7 +18,6 @@
 import imblearn
 import json
 import pandas as pd
-import pickle
 import pprint
 import skrebate
 import sys
@@ -30,11 +29,9 @@
     svm, linear_model, tree, discriminant_analysis)
 from sklearn.pipeline import make_pipeline
 from imblearn.pipeline import make_pipeline as imb_make_pipeline
+from galaxy_ml.model_persist import dump_model_to_h5, load_model_from_h5
 from galaxy_ml.utils import (SafeEval, feature_selector, get_estimator,
-                             try_get_attr, get_search_params, load_model)
-
-## TODO remove following imports after scikit-learn v0.22
-from sklearn.experimental import enable_hist_gradient_boosting
+                             try_get_attr, get_search_params)
 
 
 N_JOBS = int(__import__('os').environ.get('GALAXY_SLOTS', 1))
@@ -182,10 +179,8 @@
     regressor_path = '$final_estimator.estimator_selector.regressor'
     transformer_path = '$final_estimator.estimator_selector.transformer'
     #end if
-    with open(regressor_path, 'rb') as f:
-        regressor = load_model(f)
-    with open(transformer_path, 'rb') as f:
-        transformer = load_model(f)
+    regressor = load_model_from_h5(regressor_path)
+    transformer = load_model_from_h5(transformer_path)
     estimator = compose.TransformedTargetRegressor(regressor=regressor, transformer=transformer)
     pipeline_steps.append( estimator )
 else:
@@ -202,14 +197,8 @@
         out_obj = make_pipeline(*pipeline_steps)
     pprint.pprint(out_obj.named_steps)
 
-with open('$outfile', 'wb') as out_handler:
-    pickle.dump(out_obj, out_handler, pickle.HIGHEST_PROTOCOL)
+dump_model_to_h5(out_obj, '$outfile', verbose=0)
 
-#if $get_params
-results = get_search_params(out_obj)
-df = pd.DataFrame(results, columns=['', 'Parameter', 'Value'])
-df.to_csv('$outfile_params', sep='\t', index=False)
-#end if
             ]]>
         </configfile>
     </configfiles>
@@ -254,7 +243,9 @@
                     <expand macro="imbalanced_learn_sampling" />
                 </when>
                 <when value="IRAPS">
-                    <expand macro="estimator_params_text" label="Type in parameter settings for IRAPSCore if different from default:" help="Default(=blank): n_iter=1000, responsive_thres=-1, resistant_thres=0, random_state=None. No double quotes" />
+                    <expand macro="estimator_params_text"
+                        label="Type in parameter settings for IRAPSCore if different from default:"
+                        help="Default(=blank): n_iter=1000, responsive_thres=-1, resistant_thres=0, random_state=None. No double quotes" />
                     <param argument="p_thres" type="float" value="0.001" label="P value threshold" help="Float. default=0.001" />
                     <param argument="fc_thres" type="float" value="0.1" label="fold change threshold" help="Float. default=0.1" />
                     <param argument="occurrence" type="float" value="0.7" label="reservation factor" help="Float. default=0.7" />
@@ -267,7 +258,7 @@
         </repeat>
         <section name="final_estimator" title="Final Estimator" expanded="true">
             <conditional name="estimator_selector">
-                <param name="selected_module" type="select" label="Choose the module that contains target estimator:">
+                <param name="selected_module" type="select" label="Choose the module that contains target estimator:" >
                     <expand macro="estimator_module_options">
                         <option value="sklearn.compose">sklearn.compose</option>
                         <option value="binarize_target">Binarize Target Classifier or Regressor</option>
@@ -280,21 +271,21 @@
                         <param name="selected_estimator" type="select" label="Choose estimator class:">
                             <option value="TransformedTargetRegressor" selected="true">TransformedTargetRegressor</option>
                         </param>
-                        <param name="regressor" type="data" format="zip" label="Choose the dataset containing the wrapped regressor" />
-                        <param name="transformer" type="data" format="zip" label="Choose the dataset containing transformer" />
+                        <param name="regressor" type="data" format="h5mlm" label="Choose the dataset containing the wrapped regressor" />
+                        <param name="transformer" type="data" format="h5mlm" label="Choose the dataset containing transformer" />
                     </when>
                     <when value="binarize_target">
                         <param name="clf_or_regr" type="select" label="Classifier or Regressor:">
                             <option value="BinarizeTargetClassifier">BinarizeTargetClassifier</option>
                             <option value="BinarizeTargetRegressor">BinarizeTargetRegressor</option>
                         </param>
-                        <param name="wrapped_estimator" type="data" format="zip" label="Choose the dataset containing the wrapped estimator or pipeline" />
+                        <param name="wrapped_estimator" type="data" format="h5mlm" label="Choose the dataset containing the wrapped estimator or pipeline" />
                         <param name='z_score' type="float" value="-1" optional="false" label="Discrize target values using z_score" />
                         <param name='value' type="float" value="" optional="true" label="Discretize target values using a fixed value instead" help="Optional. default: None." />
                         <param name="less_is_positive" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Are the detecting values smaller than others?" />
                     </when>
                     <when value="custom_estimator">
-                        <param name="c_estimator" type="data" format="zip" label="Choose the dataset containing the custom estimator or pipeline" />
+                        <param name="c_estimator" type="data" format="h5mlm" label="Choose the dataset containing the custom estimator or pipeline" />
                     </when>
                     <when value="none" />
                 </expand>
@@ -304,13 +295,9 @@
             <option value="Pipeline_Builder" selected="true">Pipeline</option>
             <option value="Final_Estimator_Builder">Final Estimator</option>
         </param>-->
-        <param name="get_params" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="false" label="Output parameters for searchCV?" help="Optional. Tunable parameters could be obtained through `estimator_attributes` tool." />
     </inputs>
     <outputs>
-        <data format="zip" name="outfile" label="New Pipleline/Estimator" />
-        <data format="tabular" name="outfile_params" label="get_params for Pipleline/Estimator">
-            <filter>get_params</filter>
-        </data>
+        <data format="h5mlm" name="outfile" label="New Pipleline/Estimator" />
     </outputs>
     <tests>
         <test>
@@ -328,7 +315,7 @@
                     <param name="selected_module" value="none" />
                 </conditional>
             </section>
-            <output name="outfile" file="pipeline17" compare="sim_size" delta="30" />
+            <output name="outfile" file="pipeline17" compare="sim_size" delta="5" />
         </test>
         <test>
             <conditional name="component_selector">
@@ -340,12 +327,11 @@
             <section name="final_estimator">
                 <conditional name="estimator_selector">
                     <param name="selected_module" value="sklearn.compose" />
-                    <param name="regressor" value="RandomForestRegressor01.zip" ftype="zip" />
-                    <param name="transformer" value="pipeline17" ftype="zip" />
+                    <param name="regressor" value="RandomForestRegressor01.h5mlm" ftype="h5mlm" />
+                    <param name="transformer" value="pipeline17" ftype="h5mlm" />
                 </conditional>
             </section>
-            <param name="get_params" value="true" />
-            <output name="outfile_params" file="pipeline_params18" ftype="tabular" />
+            <output name="outfile" file="pipeline18" compare="sim_size" delta="5" />
         </test>
         <test>
             <repeat name="pipeline_component">
@@ -368,7 +354,7 @@
             <param name="selected_module" value="svm" />
             <param name="selected_estimator" value="SVR" />
             <param name="text_params" value="kernel='linear'" />
-            <output name="outfile" file="pipeline01" compare="sim_size" delta="30" />
+            <output name="outfile" file="pipeline01" compare="sim_size" delta="5" />
         </test>
         <test>
             <conditional name="component_selector">
@@ -379,7 +365,7 @@
             </conditional>
             <param name="selected_module" value="linear_model" />
             <param name="selected_estimator" value="LassoCV" />
-            <output name="outfile" file="pipeline02" compare="sim_size" delta="30" />
+            <output name="outfile" file="pipeline02" compare="sim_size" delta="5" />
         </test>
         <test>
             <conditional name="component_selector">
@@ -390,7 +376,7 @@
             </conditional>
             <param name="selected_module" value="xgboost" />
             <param name="selected_estimator" value="XGBClassifier" />
-            <output name="outfile" file="pipeline03" compare="sim_size" delta="30" />
+            <output name="outfile" file="pipeline03" compare="sim_size" delta="5" />
         </test>
         <test>
             <conditional name="component_selector">
@@ -409,7 +395,7 @@
                 <param name="selected_module" value="svm" />
                 <param name="selected_estimator" value="LinearSVC" />
             </section>
-            <output name="outfile" file="pipeline04" compare="sim_size" delta="30" />
+            <output name="outfile" file="pipeline04" compare="sim_size" delta="5" />
         </test>
         <test>
             <conditional name="component_selector">
@@ -418,56 +404,54 @@
             <param name="selected_module" value="ensemble" />
             <param name="selected_estimator" value="RandomForestRegressor" />
             <param name="text_params" value="n_estimators=100, random_state=42" />
-            <param name="get_params" value="true" />
-            <output name="outfile" file="pipeline05" compare="sim_size" delta="30" />
-            <output name="outfile_params" file="pipeline_params05.tabular" ftype="tabular" />
+            <output name="outfile" file="pipeline05" compare="sim_size" delta="5" />
         </test>
         <test>
             <conditional name="component_selector">
                 <param name="component_type" value="decomposition" />
-                <conditional name="matrix_decomposition_selector">
-                    <param name="select_algorithm" value="PCA" />
-                </conditional>
+                    <conditional name="matrix_decomposition_selector">
+                        <param name="select_algorithm" value="PCA" />
+                    </conditional>
             </conditional>
             <param name="selected_module" value="ensemble" />
             <param name="selected_estimator" value="AdaBoostRegressor" />
-            <output name="outfile" file="pipeline06" compare="sim_size" delta="30" />
+            <output name="outfile" file="pipeline06" compare="sim_size" delta="5" />
         </test>
         <test>
             <conditional name="component_selector">
                 <param name="component_type" value="kernel_approximation" />
-                <conditional name="kernel_approximation_selector">
-                    <param name="select_algorithm" value="RBFSampler" />
-                    <param name="text_params" value="n_components=10, gamma=2.0" />
-                </conditional>
+                    <conditional name="kernel_approximation_selector">
+                        <param name="select_algorithm" value="RBFSampler" />
+                        <param name="text_params" value="n_components=10, gamma=2.0" />
+                    </conditional>
             </conditional>
             <param name="selected_module" value="ensemble" />
             <param name="selected_estimator" value="AdaBoostClassifier" />
-            <output name="outfile" file="pipeline07" compare="sim_size" delta="30" />
+            <output name="outfile" file="pipeline07" compare="sim_size" delta="5" />
         </test>
         <test>
             <conditional name="component_selector">
                 <param name="component_type" value="FeatureAgglomeration" />
-                <conditional name="FeatureAgglomeration_selector">
-                    <param name="select_algorithm" value="FeatureAgglomeration" />
-                    <param name="text_params" value="n_clusters=3, affinity='euclidean'" />
-                </conditional>
+                    <conditional name="FeatureAgglomeration_selector">
+                        <param name="select_algorithm" value="FeatureAgglomeration" />
+                        <param name="text_params" value="n_clusters=3, affinity='euclidean'" />
+                    </conditional>
             </conditional>
             <param name="selected_module" value="ensemble" />
             <param name="selected_estimator" value="AdaBoostClassifier" />
-            <output name="outfile" file="pipeline08" compare="sim_size" delta="30" />
+            <output name="outfile" file="pipeline08" compare="sim_size" delta="20" />
         </test>
         <test>
             <conditional name="component_selector">
                 <param name="component_type" value="skrebate" />
-                <conditional name="skrebate_selector">
-                    <param name="select_algorithm" value="ReliefF" />
-                    <param name="text_params" value="n_features_to_select=3, n_neighbors=100" />
-                </conditional>
+                    <conditional name="skrebate_selector">
+                        <param name="select_algorithm" value="ReliefF" />
+                        <param name="text_params" value="n_features_to_select=3, n_neighbors=100" />
+                    </conditional>
             </conditional>
             <param name="selected_module" value="ensemble" />
             <param name="selected_estimator" value="RandomForestRegressor" />
-            <output name="outfile" file="pipeline09" compare="sim_size" delta="30" />
+            <output name="outfile" file="pipeline09" compare="sim_size" delta="5" />
         </test>
         <test>
             <conditional name="component_selector">
@@ -478,7 +462,7 @@
             </conditional>
             <param name="selected_module" value="ensemble" />
             <param name="selected_estimator" value="RandomForestClassifier" />
-            <output name="outfile" file="pipeline11" compare="sim_size" delta="30" />
+            <output name="outfile" file="pipeline11" compare="sim_size" delta="5" />
         </test>
         <test expect_failure="true">
             <conditional name="component_selector">
@@ -505,7 +489,7 @@
                     <param name="selected_module" value="none" />
                 </conditional>
             </section>
-            <output name="outfile" file="pipeline12" compare="sim_size" delta="30" />
+            <output name="outfile" file="pipeline12" compare="sim_size" delta="5" />
         </test>
         <test>
             <conditional name="component_selector">
@@ -513,7 +497,7 @@
             </conditional>
             <param name="selected_module" value="ensemble" />
             <param name="selected_estimator" value="RandomForestClassifier" />
-            <output name="outfile" file="RandomForestClassifier.zip" compare="sim_size" delta="30" />
+            <output name="outfile" file="RandomForestClassifier.h5mlm" compare="sim_size" delta="5" />
         </test>
         <test>
             <conditional name="component_selector">
@@ -524,7 +508,7 @@
                     <param name="selected_module" value="none" />
                 </conditional>
             </section>
-            <output name="outfile" file="pipeline14" compare="sim_size" delta="30" />
+            <output name="outfile" file="pipeline14" compare="sim_size" delta="5" />
         </test>
         <test>
             <conditional name="component_selector">
@@ -534,10 +518,10 @@
                 <conditional name="estimator_selector">
                     <param name="selected_module" value="binarize_target" />
                     <param name="clf_or_regr" value="BinarizeTargetClassifier" />
-                    <param name="wrapped_estimator" value="RandomForestClassifier.zip" ftype="zip" />
+                    <param name="wrapped_estimator" value="RandomForestClassifier.h5mlm" ftype="h5mlm" />
                 </conditional>
             </section>
-            <output name="outfile" file="pipeline15" compare="sim_size" delta="30" />
+            <output name="outfile" file="pipeline15" compare="sim_size" delta="5" />
         </test>
         <test>
             <conditional name="component_selector">
@@ -551,10 +535,10 @@
             <section name="final_estimator">
                 <conditional name="estimator_selector">
                     <param name="selected_module" value="custom_estimator" />
-                    <param name="c_estimator" value="keras_model02" ftype="zip" />
+                    <param name="c_estimator" value="keras_model02" ftype="h5mlm" />
                 </conditional>
             </section>
-            <output name="outfile" file="pipeline16" compare="sim_size" delta="30" />
+            <output name="outfile" file="pipeline16" compare="sim_size" delta="5" />
         </test>
     </tests>
     <help>
@@ -583,9 +567,9 @@
 
 **Output**
 
-- Pickled pipeline/estimator object
+- Pipeline/estimator object
 
-- Hyperparameter of the ojbect (optional)
+- Hyperparameters of the object (optional)
 
 
 .. _`Scikit-learn pipeline Pipeline`: http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html