Mercurial > repos > bgruening > sklearn_build_pipeline

diff pipeline.xml @ 7:99038af8deda draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 57f4407e278a615f47a377a3328782b1d8e0b54d
author: bgruening
date: Sun, 30 Dec 2018 01:52:15 -0500
parents: 52c4e0ef580a
children: 913ee94945f3
--- a/pipeline.xml	Thu Oct 11 03:30:40 2018 -0400
+++ b/pipeline.xml	Sun Dec 30 01:52:15 2018 -0500
@@ -5,6 +5,7 @@
     </macros>
     <expand macro="python_requirements">
         <requirement type="package" version="0.6">skrebate</requirement>
+        <requirement type="package" version="0.4.2">imbalanced-learn</requirement>
     </expand>
     <expand macro="macro_stdio"/>
     <version_command>echo "@VERSION@"</version_command>
@@ -17,34 +18,45 @@
         <inputs name="inputs" />
         <configfile name="sklearn_pipeline_script">
             <![CDATA[
-import sys
-import os
 import json
 import pprint
 import skrebate
+import imblearn
+from imblearn import under_sampling, over_sampling, combine
+from imblearn.pipeline import Pipeline as imbPipeline
 from sklearn import (preprocessing, svm, linear_model, ensemble, naive_bayes,
                     tree, neighbors, decomposition, kernel_approximation, cluster)
 from sklearn.pipeline import Pipeline
 
-exec(open("$__tool_directory__/utils.py").read(), globals())
+with open('$__tool_directory__/sk_whitelist.json', 'r') as f:
+    sk_whitelist = json.load(f)
+exec(open('$__tool_directory__/utils.py').read(), globals())
+
+warnings.filterwarnings('ignore')
 
 safe_eval = SafeEval()
 
 input_json_path = sys.argv[1]
-with open(input_json_path, "r") as param_handler:
+with open(input_json_path, 'r') as param_handler:
     params = json.load(param_handler)
 
+#if $final_estimator.estimator_selector.selected_module == 'customer_estimator':
+params['final_estimator']['estimator_selector']['c_estimator'] =\
+        '$final_estimator.estimator_selector.c_estimator'
+#end if
+
 pipeline_steps = []
 
 def get_component(input_json, check_none=False):
+    is_imblearn = False
     if input_json['component_type'] == 'None':
         if not check_none:
-            return
+            return None, False
         else:
             sys.exit("The pre-processing component type can't be None when the number of components is greater than 1.")
     if input_json['component_type'] == 'pre_processor':
-        preprocessor = input_json["pre_processors"]["selected_pre_processor"]
-        pre_processor_options = input_json["pre_processors"]["options"]
+        preprocessor = input_json['pre_processors']['selected_pre_processor']
+        pre_processor_options = input_json['pre_processors']['options']
         my_class = getattr(preprocessing, preprocessor)
         obj = my_class(**pre_processor_options)
     elif input_json['component_type'] == 'feature_selection':
@@ -53,22 +65,22 @@
         algorithm = input_json['matrix_decomposition_selector']['select_algorithm']
         obj = getattr(decomposition, algorithm)()
         options = input_json['matrix_decomposition_selector']['text_params'].strip()
-        if options != "":
-            options = safe_eval('dict(' + options + ')')
+        if options != '':
+            options = safe_eval( 'dict(' + options + ')' )
             obj.set_params(**options)
     elif input_json['component_type'] == 'kernel_approximation':
         algorithm = input_json['kernel_approximation_selector']['select_algorithm']
         obj = getattr(kernel_approximation, algorithm)()
         options = input_json['kernel_approximation_selector']['text_params'].strip()
-        if options != "":
-            options = safe_eval('dict(' + options + ')')
+        if options != '':
+            options = safe_eval( 'dict(' + options + ')' )
             obj.set_params(**options)
     elif input_json['component_type'] == 'FeatureAgglomeration':
         algorithm = input_json['FeatureAgglomeration_selector']['select_algorithm']
         obj = getattr(cluster, algorithm)()
         options = input_json['FeatureAgglomeration_selector']['text_params'].strip()
-        if options != "":
-            options = safe_eval('dict(' + options + ')')
+        if options != '':
+            options = safe_eval( 'dict(' + options + ')' )
             obj.set_params(**options)
     elif input_json['component_type'] == 'skrebate':
         algorithm = input_json['skrebate_selector']['select_algorithm']
@@ -77,32 +89,58 @@
         else:
             obj = getattr(skrebate, algorithm)()
         options = input_json['skrebate_selector']['text_params'].strip()
-        if options != "":
-            options = safe_eval('dict(' + options + ')')
+        if options != '':
+            options = safe_eval( 'dict(' + options + ')' )
+            obj.set_params(**options)
+    elif input_json['component_type'] == 'imblearn':
+        is_imblearn = True
+        algorithm = input_json['imblearn_selector']['select_algorithm']
+        if algorithm == 'over_sampling.SMOTENC':
+            obj = over_sampling.SMOTENC(categorical_features=[])
+        else:
+            globals = algorithm.split('.')
+            mod, klass = globals[0], globals[1]
+            obj = getattr(getattr(imblearn, mod), klass)()
+        options = input_json['imblearn_selector']['text_params'].strip()
+        if options != '':
+            options = safe_eval( 'dict(' + options + ')' )
             obj.set_params(**options)
     if 'n_jobs' in obj.get_params():
         obj.set_params( n_jobs=N_JOBS )
-    return obj
+    return obj, is_imblearn
 
+has_imblearn = False
 if len(params['pipeline_component']) == 1:
-    step_obj = get_component( params['pipeline_component'][0]['component_selector'])
+    step_obj, is_imblearn = get_component( params['pipeline_component'][0]['component_selector'])
     if step_obj:
         pipeline_steps.append( ('preprocessing_1', step_obj) )
+        if is_imblearn:
+            has_imblearn = True
 else:
     for i, c in enumerate(params['pipeline_component']):
-        step_obj = get_component( c['component_selector'], check_none=True )
+        step_obj, is_imblearn = get_component( c['component_selector'], check_none=True )
         pipeline_steps.append( ('preprocessing_' + str(i+1), step_obj) )
+        if is_imblearn:
+            has_imblearn = True
 
 # Set up final estimator and add to pipeline.
-estimator_json = params["final_estimator"]['estimator_selector']
-estimator = get_estimator(estimator_json)
+estimator_json = params['final_estimator']['estimator_selector']
+if estimator_json['selected_module'] == 'none':
+    if len(pipeline_steps) == 0:
+        sys.exit("No pipeline steps specified!")
+    else:   # promote the last pre-processing component to be the final estimator
+        pipeline_steps[-1] = ('estimator', pipeline_steps[-1][-1])
+else:
+    estimator = get_estimator(estimator_json)
+    pipeline_steps.append( ('estimator', estimator) )
 
-pipeline_steps.append( ('estimator', estimator) )
-
-pipeline = Pipeline(pipeline_steps)
+if has_imblearn:
+    pipeline = imbPipeline(pipeline_steps)
+else:
+    pipeline = Pipeline(pipeline_steps)
 pprint.pprint(pipeline.named_steps)
 
-with open("$outfile", 'wb') as out_handler:
+with open('$outfile', 'wb') as out_handler:
     pickle.dump(pipeline, out_handler, pickle.HIGHEST_PROTOCOL)
 
             ]]>
@@ -118,7 +156,8 @@
                     <option value="decomposition">Matrix Decomposition</option>
                     <option value="kernel_approximation">Kernel Approximation</option>
                     <option value="FeatureAgglomeration">Agglomerate Features</option>
-                    <option value="skrebate">Skrebate algorithm</option>
+                    <option value="skrebate">SK-rebate feature selection</option>
+                    <option value="imblearn">imbalanced-learn sampling</option>
                 </param>
                 <when value="None"/>
                 <when value="pre_processor">
@@ -128,9 +167,7 @@
                     </conditional>
                 </when>
                 <when value="feature_selection">
-                    <expand macro="feature_selection_all">
-                        <expand macro="fs_selectfrommodel_no_prefitted"/>
-                    </expand>
+                    <expand macro="feature_selection_pipeline"/>
                 </when>
                 <when value="decomposition">
                     <expand macro="matrix_decomposition_all"/>
@@ -144,10 +181,26 @@
                 <when value="skrebate">
                     <expand macro="skrebate"/>
                 </when>
+                <when value="imblearn">
+                    <expand macro="imbalanced_learn_sampling"/>
+                </when>
             </conditional>
         </repeat>
         <section name="final_estimator" title="Final Estimator" expanded="true">
-            <expand macro="estimator_selector_all" />
+            <conditional name="estimator_selector">
+                <param name="selected_module" type="select" label="Choose the module that contains target estimator:" >
+                    <expand macro="estimator_module_options">
+                        <option value="customer_estimator">Load a customer estimator</option>
+                        <option value="none">none -- The last component of pre-processing step will turn to a final estimator</option>
+                    </expand>
+                </param>
+                <expand macro="estimator_suboptions">
+                    <when value="customer_estimator">
+                        <param name="c_estimator" type="data" format="zip" label="Choose the dataset containing the customer estimator or pipeline:"/>
+                    </when>
+                    <when value="none"/>
+                </expand>
+            </conditional>
         </section>
     </inputs>
     <outputs>
@@ -175,7 +228,7 @@
             <param name="selected_module" value="svm"/>
             <param name="selected_estimator" value="SVR"/>
             <param name="text_params" value="kernel='linear'"/>
-            <output name="outfile" file="pipeline01" compare="sim_size" delta="1"/>
+            <output name="outfile" file="pipeline01" compare="sim_size" delta="5"/>
         </test>
         <test>
             <conditional name="component_selector">
@@ -186,7 +239,7 @@
             </conditional>
             <param name="selected_module" value="linear_model"/>
             <param name="selected_estimator" value="LassoCV"/>
-            <output name="outfile" file="pipeline02" compare="sim_size" delta="1"/>
+            <output name="outfile" file="pipeline02" compare="sim_size" delta="5"/>
         </test>
         <test>
             <conditional name="component_selector">
@@ -197,7 +250,7 @@
             </conditional>
             <param name="selected_module" value="xgboost"/>
             <param name="selected_estimator" value="XGBClassifier"/>
-            <output name="outfile" file="pipeline03" compare="sim_size" delta="1"/>
+            <output name="outfile" file="pipeline03" compare="sim_size" delta="5"/>
         </test>
         <test>
             <conditional name="component_selector">
@@ -216,7 +269,7 @@
                 <param name="selected_module" value="svm"/>
                 <param name="selected_estimator" value="LinearSVC"/>
             </section>
-            <output name="outfile" file="pipeline04" compare="sim_size" delta="1"/>
+            <output name="outfile" file="pipeline04" compare="sim_size" delta="5"/>
         </test>
         <test>
             <conditional name="component_selector">
@@ -225,7 +278,7 @@
             <param name="selected_module" value="ensemble"/>
             <param name="selected_estimator" value="RandomForestRegressor"/>
             <param name="text_params" value="n_estimators=100, random_state=42"/>
-            <output name="outfile" file="pipeline05" compare="sim_size" delta="1"/>
+            <output name="outfile" file="pipeline05" compare="sim_size" delta="5"/>
         </test>
         <test>
             <conditional name="component_selector">
@@ -236,7 +289,7 @@
             </conditional>
             <param name="selected_module" value="ensemble"/>
             <param name="selected_estimator" value="AdaBoostRegressor"/>
-            <output name="outfile" file="pipeline06" compare="sim_size" delta="1"/>
+            <output name="outfile" file="pipeline06" compare="sim_size" delta="5"/>
         </test>
         <test>
             <conditional name="component_selector">
@@ -248,7 +301,7 @@
             </conditional>
             <param name="selected_module" value="ensemble"/>
             <param name="selected_estimator" value="AdaBoostClassifier"/>
-            <output name="outfile" file="pipeline07" compare="sim_size" delta="1"/>
+            <output name="outfile" file="pipeline07" compare="sim_size" delta="5"/>
         </test>
         <test>
             <conditional name="component_selector">
@@ -260,7 +313,7 @@
             </conditional>
             <param name="selected_module" value="ensemble"/>
             <param name="selected_estimator" value="AdaBoostClassifier"/>
-            <output name="outfile" file="pipeline08" compare="sim_size" delta="1"/>
+            <output name="outfile" file="pipeline08" compare="sim_size" delta="5"/>
         </test>
         <test>
             <conditional name="component_selector">
@@ -272,19 +325,26 @@
             </conditional>
             <param name="selected_module" value="ensemble"/>
             <param name="selected_estimator" value="RandomForestRegressor"/>
-            <output name="outfile" file="pipeline09" compare="sim_size" delta="1"/>
+            <output name="outfile" file="pipeline09" compare="sim_size" delta="5"/>
         </test>
         <test>
             <conditional name="component_selector">
-                <param name="component_type" value="skrebate"/>
-                    <conditional name="skrebate_selector">
-                        <param name="select_algorithm" value="TuRF"/>
-                        <param name="text_params" value=""/>
-                    </conditional>
+                <param name="component_type" value="None"/>
             </conditional>
             <param name="selected_module" value="ensemble"/>
-            <param name="selected_estimator" value="RandomForestRegressor"/>
-            <output name="outfile" file="pipeline10" compare="sim_size" delta="1"/>
+            <param name="selected_estimator" value="AdaBoostRegressor"/>
+            <output name="outfile" file="pipeline10" compare="sim_size" delta="5"/>
+        </test>
+        <test>
+            <conditional name="component_selector">
+                <param name="component_type" value="imblearn"/>
+                <conditional name="imblearn_selector">
+                    <param name="select_algorithm" value="under_sampling.EditedNearestNeighbours"/>
+                </conditional>
+            </conditional>
+            <param name="selected_module" value="ensemble"/>
+            <param name="selected_estimator" value="RandomForestClassifier"/>
+            <output name="outfile" file="pipeline11" compare="sim_size" delta="5"/>
         </test>
         <test expect_failure="true">
             <conditional name="component_selector">
@@ -294,6 +354,25 @@
             <param name="selected_estimator" value="RandomForestRegressor"/>
             <param name="text_params" value="n_estimators=__import__('os').system('ls ~')"/>
         </test>
+        <test>
+            <conditional name="component_selector">
+                <param name="component_type" value="feature_selection"/>
+                <conditional name="fs_algorithm_selector">
+                    <param name="selected_algorithm" value="RFE"/>
+                    <conditional name="estimator_selector">
+                        <param name="selected_module" value="xgboost"/>
+                        <param name="selected_estimator" value="XGBRegressor"/>
+                        <param name="text_params" value="random_state=0"/>
+                    </conditional>
+                </conditional>
+            </conditional>
+            <section name="final_estimator">
+                <conditional name="estimator_selector">
+                    <param name="selected_module" value="none"/>
+                </conditional>
+            </section>
+            <output name="outfile" file="pipeline12" compare="sim_size" delta="5"/>
+        </test>
     </tests>
     <help>
         <![CDATA[
@@ -328,5 +407,6 @@
     <expand macro="sklearn_citation">
         <expand macro="skrebate_citation"/>
         <expand macro="xgboost_citation"/>
+        <expand macro="imblearn_citation"/>
     </expand>
 </tool>
author	bgruening
date	Sun, 30 Dec 2018 01:52:15 -0500
parents	52c4e0ef580a
children	913ee94945f3