Mercurial > repos > bgruening > sklearn_build_pipeline

diff pipeline.xml @ 8:913ee94945f3 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit c0a3a186966888e5787335a7628bf0a4382637e7
author: bgruening
date: Tue, 14 May 2019 18:06:37 -0400
parents: 99038af8deda
children: 775b004b7920
--- a/pipeline.xml	Sun Dec 30 01:52:15 2018 -0500
+++ b/pipeline.xml	Tue May 14 18:06:37 2019 -0400
@@ -3,10 +3,7 @@
     <macros>
         <import>main_macros.xml</import>
     </macros>
-    <expand macro="python_requirements">
-        <requirement type="package" version="0.6">skrebate</requirement>
-        <requirement type="package" version="0.4.2">imbalanced-learn</requirement>
-    </expand>
+    <expand macro="python_requirements"/>
     <expand macro="macro_stdio"/>
     <version_command>echo "@VERSION@"</version_command>
     <command>
@@ -18,19 +15,28 @@
         <inputs name="inputs" />
         <configfile name="sklearn_pipeline_script">
             <![CDATA[
+import imblearn
 import json
+import pickle
 import pprint
 import skrebate
-import imblearn
-from imblearn import under_sampling, over_sampling, combine
-from imblearn.pipeline import Pipeline as imbPipeline
-from sklearn import (preprocessing, svm, linear_model, ensemble, naive_bayes,
-                    tree, neighbors, decomposition, kernel_approximation, cluster)
-from sklearn.pipeline import Pipeline
+import sys
+import warnings
+from mlxtend import classifier, regressor
+from sklearn import (
+    cluster, compose, decomposition, ensemble, feature_extraction,
+    feature_selection, gaussian_process, kernel_approximation, metrics,
+    model_selection, naive_bayes, neighbors, pipeline, preprocessing,
+    svm, linear_model, tree, discriminant_analysis)
+from sklearn.pipeline import make_pipeline
+from imblearn.pipeline import make_pipeline as imb_make_pipeline
 
-with open('$__tool_directory__/sk_whitelist.json', 'r') as f:
-    sk_whitelist = json.load(f)
-exec(open('$__tool_directory__/utils.py').read(), globals())
+sys.path.insert(0, '$__tool_directory__')
+
+from utils import SafeEval, feature_selector, get_estimator, try_get_attr
+from preprocessors import Z_RandomOverSampler
+
+N_JOBS = int(__import__('os').environ.get('GALAXY_SLOTS', 1))
 
 warnings.filterwarnings('ignore')
 
@@ -40,11 +46,16 @@
 with open(input_json_path, 'r') as param_handler:
     params = json.load(param_handler)
 
-#if $final_estimator.estimator_selector.selected_module == 'customer_estimator':
+#if $final_estimator.estimator_selector.selected_module == 'custom_estimator':
 params['final_estimator']['estimator_selector']['c_estimator'] =\
         '$final_estimator.estimator_selector.c_estimator'
 #end if
 
+#if $final_estimator.estimator_selector.selected_module == 'binarize_target':
+params['final_estimator']['estimator_selector']['wrapped_estimator'] =\
+        '$final_estimator.estimator_selector.wrapped_estimator'
+#end if
+
 pipeline_steps = []
 
 def get_component(input_json, check_none=False):
@@ -53,7 +64,8 @@
         if not check_none:
             return None, False
         else:
-            sys.exit("The pre-processing component type can't be None when the number of components is greater than 1.")
+            sys.exit("The pre-processing component type can't be None "
+                     "when the number of components is greater than 1.")
     if input_json['component_type'] == 'pre_processor':
         preprocessor = input_json['pre_processors']['selected_pre_processor']
         pre_processor_options = input_json['pre_processors']['options']
@@ -97,6 +109,8 @@
         algorithm = input_json['imblearn_selector']['select_algorithm']
         if algorithm == 'over_sampling.SMOTENC':
             obj = over_sampling.SMOTENC(categorical_features=[])
+        elif algorithm == 'Z_RandomOverSampler':
+            obj = Z_RandomOverSampler()
         else:
             globals = algorithm.split('.')
             mod, klass = globals[0], globals[1]
@@ -105,6 +119,26 @@
         if options != '':
             options = safe_eval( 'dict(' + options + ')' )
             obj.set_params(**options)
+    elif input_json['component_type'] == 'IRAPS':
+        iraps_core = try_get_attr('iraps_classifier','IRAPSCore')()
+        core_params = input_json['text_params'].strip()
+        if core_params != '':
+            try:
+                params = safe_eval('dict(' + core_params + ')')
+            except ValueError:
+                sys.exit("Unsupported parameter input: `%s`" % core_params)
+            iraps_core.set_params(**params)
+        options = {}
+        if input_json['p_thres'] is not None:
+            options['p_thres'] = input_json['p_thres']
+        if input_json['fc_thres'] is not None:
+            options['fc_thres'] = input_json['fc_thres']
+        if input_json['occurrence'] is not None:
+            options['occurrence'] = input_json['occurrence']
+        if input_json['discretize'] is not None:
+            options['discretize'] = input_json['discretize']
+        IRAPSClassifier = try_get_attr('iraps_classifier','IRAPSClassifier')
+        obj = IRAPSClassifier(iraps_core, **options)
     if 'n_jobs' in obj.get_params():
         obj.set_params( n_jobs=N_JOBS )
     return obj, is_imblearn
@@ -113,36 +147,41 @@
 if len(params['pipeline_component']) == 1:
     step_obj, is_imblearn = get_component( params['pipeline_component'][0]['component_selector'])
     if step_obj:
-        pipeline_steps.append( ('preprocessing_1', step_obj) )
+        pipeline_steps.append( step_obj )
         if is_imblearn:
             has_imblearn = True
 else:
     for i, c in enumerate(params['pipeline_component']):
         step_obj, is_imblearn = get_component( c['component_selector'], check_none=True )
-        pipeline_steps.append( ('preprocessing_' + str(i+1), step_obj) )
+        pipeline_steps.append(  step_obj )
         if is_imblearn:
             has_imblearn = True
 
-# Set up final estimator and add to pipeline.
+## Set up final estimator and add to pipeline.
 estimator_json = params['final_estimator']['estimator_selector']
 if estimator_json['selected_module'] == 'none':
     if len(pipeline_steps) == 0:
         sys.exit("No pipeline steps specified!")
-    else:   # turn the last pre-process component to final estimator
-        pipeline_steps[-1] = ('estimator', pipeline_steps[-1][-1])
+    ## else: the last pre-processing component is used as the final estimator
 else:
     estimator = get_estimator(estimator_json)
-    pipeline_steps.append( ('estimator', estimator) )
+    pipeline_steps.append( estimator )
 
+#if $output_type == 'Final_Estimator_Builder':
+with open('$outfile', 'wb') as out_handler:
+    final_est = pipeline_steps[-1]
+    print(final_est)
+    pickle.dump(final_est, out_handler, pickle.HIGHEST_PROTOCOL)
+#else:
 if has_imblearn:
-    pipeline = imbPipeline(pipeline_steps)
+    pipeline = imb_make_pipeline(*pipeline_steps)
 else:
-    pipeline = Pipeline(pipeline_steps)
+    pipeline = make_pipeline(*pipeline_steps)
 pprint.pprint(pipeline.named_steps)
 
 with open('$outfile', 'wb') as out_handler:
     pickle.dump(pipeline, out_handler, pickle.HIGHEST_PROTOCOL)
-
+#end if
             ]]>
         </configfile>
     </configfiles>
@@ -158,6 +197,7 @@
                     <option value="FeatureAgglomeration">Agglomerate Features</option>
                     <option value="skrebate">SK-rebate feature selection</option>
                     <option value="imblearn">imbalanced-learn sampling</option>
+                    <option value="IRAPS">IRAPS -- feature selector and classifier</option>
                 </param>
                 <when value="None"/>
                 <when value="pre_processor">
@@ -184,27 +224,51 @@
                 <when value="imblearn">
                     <expand macro="imbalanced_learn_sampling"/>
                 </when>
+                <when value="IRAPS">
+                    <expand macro="estimator_params_text"
+                        label="Type in parameter settings for IRAPSCore if different from default:"
+                        help="Default(=blank): n_iter=1000, responsive_thres=-1, resistant_thres=0, random_state=None. No double quotes"/>
+                    <param argument="p_thres" type="float" value="0.001" label="P value threshold" help="Float. default=0.001"/>
+                    <param argument="fc_thres" type="float" value="0.1" label="fold change threshold" help="Float. default=0.1"/>
+                    <param argument="occurrence" type="float" value="0.7" label="reservation factor" help="Float. default=0.7"/>
+                    <param argument="discretize" type="float" value="-1" label="The z_score threshold to discretize target value" help="Float. default=-1"/>
+                </when>
             </conditional>
         </repeat>
         <section name="final_estimator" title="Final Estimator" expanded="true">
             <conditional name="estimator_selector">
                 <param name="selected_module" type="select" label="Choose the module that contains target estimator:" >
                     <expand macro="estimator_module_options">
-                        <option value="customer_estimator">Load a customer estimator</option>
+                        <option value="binarize_target">Binarize Target Classifier or Regressor</option>
+                        <option value="custom_estimator">Load a custom estimator</option>
                         <option value="none">none -- The last component of pre-processing step will turn to a final estimator</option>
                     </expand>
                 </param>
                 <expand macro="estimator_suboptions">
-                    <when value="customer_estimator">
-                        <param name="c_estimator" type="data" format="zip" label="Choose the dataset containing the customer estimator or pipeline:"/>
+                    <when value="binarize_target">
+                        <param name="clf_or_regr" type="select" label="Classifier or Regressor:">
+                            <option value="BinarizeTargetClassifier">BinarizeTargetClassifier</option>
+                            <option value="BinarizeTargetRegressor">BinarizeTargetRegressor</option>
+                        </param>
+                        <param name="wrapped_estimator" type="data" format="zip" label="Choose the dataset containing the wrapped estimator or pipeline"/>
+                        <param name='z_score' type="float" value="-1" optional="false" label="Discretize target values using z_score"/>
+                        <param name='value' type="float" value="" optional="true" label="Discretize target values using a fixed value instead" help="Optional. default: None."/>
+                        <param name="less_is_positive" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Are the detecting values smaller than others?"/>
+                    </when>
+                    <when value="custom_estimator">
+                        <param name="c_estimator" type="data" format="zip" label="Choose the dataset containing the custom estimator or pipeline"/>
                     </when>
                     <when value="none"/>
                 </expand>
             </conditional>
         </section>
+        <param name="output_type" type="select" label="Output the final estimator instead?">
+            <option value="Pipeline_Builder" selected="true">Pipeline</option>
+            <option value="Final_Estimator_Builder">Final Estimator</option>
+        </param>
     </inputs>
     <outputs>
-        <data format="zip" name="outfile"/>
+        <data format="zip" name="outfile" label="${output_type}"/>
     </outputs>
     <tests>
         <test>
@@ -313,7 +377,7 @@
             </conditional>
             <param name="selected_module" value="ensemble"/>
             <param name="selected_estimator" value="AdaBoostClassifier"/>
-            <output name="outfile" file="pipeline08" compare="sim_size" delta="5"/>
+            <output name="outfile" file="pipeline08" compare="sim_size" delta="20"/>
         </test>
         <test>
             <conditional name="component_selector">
@@ -373,6 +437,41 @@
             </section>
             <output name="outfile" file="pipeline12" compare="sim_size" delta="5"/>
         </test>
+        <test>
+            <conditional name="component_selector">
+                <param name="component_type" value="None"/>
+            </conditional>
+            <param name="selected_module" value="ensemble"/>
+            <param name="selected_estimator" value="RandomForestClassifier"/>
+            <param name="output_type" value="Final_Estimator_Builder"/>
+            <output name="outfile" file="RandomForestClassifier.zip" compare="sim_size" delta="5"/>
+        </test>
+        <test>
+            <conditional name="component_selector">
+                <param name="component_type" value="IRAPS"/>
+            </conditional>
+            <section name="final_estimator">
+                <conditional name="estimator_selector">
+                    <param name="selected_module" value="none"/>
+                </conditional>
+            </section>
+            <param name="output_type" value="Final_Estimator_Builder"/>
+            <output name="outfile" file="pipeline14" compare="sim_size" delta="5"/>
+        </test>
+        <test>
+            <conditional name="component_selector">
+                <param name="component_type" value="None"/>
+            </conditional>
+            <section name="final_estimator">
+                <conditional name="estimator_selector">
+                    <param name="selected_module" value="binarize_target"/>
+                    <param name="clf_or_regr" value="BinarizeTargetClassifier"/>
+                    <param name="wrapped_estimator" value="RandomForestClassifier.zip" ftype="zip"/>
+                </conditional>
+            </section>
+            <param name="output_type" value="Final_Estimator_Builder"/>
+            <output name="outfile" file="pipeline15" compare="sim_size" delta="5"/>
+        </test>
     </tests>
     <help>
         <![CDATA[
author	bgruening
date	Tue, 14 May 2019 18:06:37 -0400
parents	99038af8deda
children	775b004b7920