Mercurial > repos > bgruening > sklearn_train_test_split

diff keras_train_and_eval.py @ 11:5da2217cd788 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 9981e25b00de29ed881b2229a173a8c812ded9bb
author: bgruening
date: Wed, 09 Aug 2023 13:33:25 +0000
parents: 3312fb686ffb
children: e10ffa9e0b90
--- a/keras_train_and_eval.py	Thu Aug 11 09:36:17 2022 +0000
+++ b/keras_train_and_eval.py	Wed Aug 09 13:33:25 2023 +0000
@@ -1,34 +1,43 @@
 import argparse
 import json
 import os
-import pickle
 import warnings
 from itertools import chain
 
 import joblib
 import numpy as np
 import pandas as pd
-from galaxy_ml.externals.selene_sdk.utils import compute_score
-from galaxy_ml.keras_galaxy_models import _predict_generator
+from galaxy_ml.keras_galaxy_models import (
+    _predict_generator,
+    KerasGBatchClassifier,
+)
+from galaxy_ml.model_persist import dump_model_to_h5, load_model_from_h5
 from galaxy_ml.model_validations import train_test_split
-from galaxy_ml.utils import (clean_params, get_main_estimator,
-                             get_module, get_scoring, load_model, read_columns,
-                             SafeEval, try_get_attr)
+from galaxy_ml.utils import (
+    clean_params,
+    gen_compute_scores,
+    get_main_estimator,
+    get_module,
+    get_scoring,
+    read_columns,
+    SafeEval
+)
 from scipy.io import mmread
-from sklearn.metrics.scorer import _check_multimetric_scoring
-from sklearn.model_selection import _search, _validation
+from sklearn.metrics._scorer import _check_multimetric_scoring
 from sklearn.model_selection._validation import _score
-from sklearn.pipeline import Pipeline
-from sklearn.utils import indexable, safe_indexing
-
-_fit_and_score = try_get_attr("galaxy_ml.model_validations", "_fit_and_score")
-setattr(_search, "_fit_and_score", _fit_and_score)
-setattr(_validation, "_fit_and_score", _fit_and_score)
+from sklearn.utils import _safe_indexing, indexable
 
 N_JOBS = int(os.environ.get("GALAXY_SLOTS", 1))
 CACHE_DIR = os.path.join(os.getcwd(), "cached")
-del os
-NON_SEARCHABLE = ("n_jobs", "pre_dispatch", "memory", "_path", "nthread", "callbacks")
+NON_SEARCHABLE = (
+    "n_jobs",
+    "pre_dispatch",
+    "memory",
+    "_path",
+    "_dir",
+    "nthread",
+    "callbacks",
+)
 ALLOWED_CALLBACKS = (
     "EarlyStopping",
     "TerminateOnNaN",
@@ -96,7 +105,7 @@
         train = index_arr[~np.isin(groups, group_names)]
         rval = list(
             chain.from_iterable(
-                (safe_indexing(a, train), safe_indexing(a, test)) for a in new_arrays
+                (_safe_indexing(a, train), _safe_indexing(a, test)) for a in new_arrays
             )
         )
     else:
@@ -108,68 +117,69 @@
     return rval
 
 
-def _evaluate(y_true, pred_probas, scorer, is_multimetric=True):
-    """output scores based on input scorer
+def _evaluate_keras_and_sklearn_scores(
+    estimator,
+    data_generator,
+    X,
+    y=None,
+    sk_scoring=None,
+    steps=None,
+    batch_size=32,
+    return_predictions=False,
+):
+    """output scores for bother keras and sklearn metrics
 
     Parameters
-    ----------
-    y_true : array
-        True label or target values
-    pred_probas : array
-        Prediction values, probability for classification problem
-    scorer : dict
-        dict of `sklearn.metrics.scorer.SCORER`
-    is_multimetric : bool, default is True
+    -----------
+    estimator : object
+        Fitted `galaxy_ml.keras_galaxy_models.KerasGBatchClassifier`.
+    data_generator : object
+        From `galaxy_ml.preprocessors.ImageDataFrameBatchGenerator`.
+    X : 2-D array
+        Contains indecies of images that need to be evaluated.
+    y : None
+        Target value.
+    sk_scoring : dict
+        Galaxy tool input parameters.
+    steps : integer or None
+        Evaluation/prediction steps before stop.
+    batch_size : integer
+        Number of samples in a batch
+    return_predictions : bool, default is False
+        Whether to return predictions and true labels.
     """
-    if y_true.ndim == 1 or y_true.shape[-1] == 1:
-        pred_probas = pred_probas.ravel()
-        pred_labels = (pred_probas > 0.5).astype("int32")
-        targets = y_true.ravel().astype("int32")
-        if not is_multimetric:
-            preds = (
-                pred_labels
-                if scorer.__class__.__name__ == "_PredictScorer"
-                else pred_probas
-            )
-            score = scorer._score_func(targets, preds, **scorer._kwargs)
+    scores = {}
 
-            return score
-        else:
-            scores = {}
-            for name, one_scorer in scorer.items():
-                preds = (
-                    pred_labels
-                    if one_scorer.__class__.__name__ == "_PredictScorer"
-                    else pred_probas
-                )
-                score = one_scorer._score_func(targets, preds, **one_scorer._kwargs)
-                scores[name] = score
-
-    # TODO: multi-class metrics
-    # multi-label
+    generator = data_generator.flow(X, y=y, batch_size=batch_size)
+    # keras metrics evaluation
+    # handle scorer, convert to scorer dict
+    generator.reset()
+    score_results = estimator.model_.evaluate_generator(generator, steps=steps)
+    metrics_names = estimator.model_.metrics_names
+    if not isinstance(metrics_names, list):
+        scores[metrics_names] = score_results
     else:
-        pred_labels = (pred_probas > 0.5).astype("int32")
-        targets = y_true.astype("int32")
-        if not is_multimetric:
-            preds = (
-                pred_labels
-                if scorer.__class__.__name__ == "_PredictScorer"
-                else pred_probas
-            )
-            score, _ = compute_score(preds, targets, scorer._score_func)
-            return score
-        else:
-            scores = {}
-            for name, one_scorer in scorer.items():
-                preds = (
-                    pred_labels
-                    if one_scorer.__class__.__name__ == "_PredictScorer"
-                    else pred_probas
-                )
-                score, _ = compute_score(preds, targets, one_scorer._score_func)
-                scores[name] = score
+        scores = dict(zip(metrics_names, score_results))
+
+    if sk_scoring["primary_scoring"] == "default" and not return_predictions:
+        return scores
+
+    generator.reset()
+    predictions, y_true = _predict_generator(estimator.model_, generator, steps=steps)
 
-    return scores
+    # for sklearn metrics
+    if sk_scoring["primary_scoring"] != "default":
+        scorer = get_scoring(sk_scoring)
+        if not isinstance(scorer, (dict, list)):
+            scorer = [sk_scoring["primary_scoring"]]
+        scorer = _check_multimetric_scoring(estimator, scoring=scorer)
+        sk_scores = gen_compute_scores(y_true, predictions, scorer)
+        scores.update(sk_scores)
+
+    if return_predictions:
+        return scores, predictions, y_true
+    else:
+        return scores, None, None
 
 
 def main(
@@ -179,7 +189,6 @@
     infile2,
     outfile_result,
     outfile_object=None,
-    outfile_weights=None,
     outfile_y_true=None,
     outfile_y_preds=None,
     groups=None,
@@ -192,46 +201,43 @@
     Parameter
     ---------
     inputs : str
-        File path to galaxy tool parameter
+        File path to galaxy tool parameter.
 
     infile_estimator : str
-        File path to estimator
+        File path to estimator.
 
     infile1 : str
-        File path to dataset containing features
+        File path to dataset containing features.
 
     infile2 : str
-        File path to dataset containing target values
+        File path to dataset containing target values.
 
     outfile_result : str
-        File path to save the results, either cv_results or test result
+        File path to save the results, either cv_results or test result.
 
     outfile_object : str, optional
-        File path to save searchCV object
-
-    outfile_weights : str, optional
-        File path to save deep learning model weights
+        File path to save searchCV object.
 
     outfile_y_true : str, optional
-        File path to target values for prediction
+        File path to target values for prediction.
 
     outfile_y_preds : str, optional
-        File path to save deep learning model weights
+        File path to save predictions.
 
     groups : str
-        File path to dataset containing groups labels
+        File path to dataset containing groups labels.
 
     ref_seq : str
-        File path to dataset containing genome sequence file
+        File path to dataset containing genome sequence file.
 
     intervals : str
-        File path to dataset containing interval file
+        File path to dataset containing interval file.
 
     targets : str
-        File path to dataset compressed target bed file
+        File path to dataset compressed target bed file.
 
     fasta_path : str
-        File path to dataset containing fasta file
+        File path to dataset containing fasta file.
     """
     warnings.simplefilter("ignore")
 
@@ -239,8 +245,7 @@
         params = json.load(param_handler)
 
     #  load estimator
-    with open(infile_estimator, "rb") as estimator_handler:
-        estimator = load_model(estimator_handler)
+    estimator = load_model_from_h5(infile_estimator)
 
     estimator = clean_params(estimator)
 
@@ -333,7 +338,12 @@
         loaded_df[df_key] = infile2
 
     y = read_columns(
-        infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True
+        infile2,
+        c=c,
+        c_option=column_option,
+        sep="\t",
+        header=header,
+        parse_dates=True,
     )
     if len(y.shape) == 2 and y.shape[1] == 1:
         y = y.ravel()
@@ -387,16 +397,10 @@
 
     # handle scorer, convert to scorer dict
     scoring = params["experiment_schemes"]["metrics"]["scoring"]
-    if scoring is not None:
-        # get_scoring() expects secondary_scoring to be a comma separated string (not a list)
-        # Check if secondary_scoring is specified
-        secondary_scoring = scoring.get("secondary_scoring", None)
-        if secondary_scoring is not None:
-            # If secondary_scoring is specified, convert the list into comman separated string
-            scoring["secondary_scoring"] = ",".join(scoring["secondary_scoring"])
-
     scorer = get_scoring(scoring)
-    scorer, _ = _check_multimetric_scoring(estimator, scoring=scorer)
+    if not isinstance(scorer, (dict, list)):
+        scorer = [scoring["primary_scoring"]]
+    scorer = _check_multimetric_scoring(estimator, scoring=scorer)
 
     # handle test (first) split
     test_split_options = params["experiment_schemes"]["test_split"]["split_algos"]
@@ -411,14 +415,9 @@
                 "Stratified shuffle split is not " "applicable on empty target values!"
             )
 
-    (
-        X_train,
-        X_test,
-        y_train,
-        y_test,
-        groups_train,
-        _groups_test,
-    ) = train_test_split_none(X, y, groups, **test_split_options)
+    X_train, X_test, y_train, y_test, groups_train, groups_test = train_test_split_none(
+        X, y, groups, **test_split_options
+    )
 
     exp_scheme = params["experiment_schemes"]["selected_exp_scheme"]
 
@@ -443,11 +442,11 @@
             y_train,
             y_val,
             groups_train,
-            _groups_val,
+            groups_val,
         ) = train_test_split_none(X_train, y_train, groups_train, **val_split_options)
 
     # train and eval
-    if hasattr(estimator, "validation_data"):
+    if hasattr(estimator, "config") and hasattr(estimator, "model_type"):
         if exp_scheme == "train_val_test":
             estimator.fit(X_train, y_train, validation_data=(X_val, y_val))
         else:
@@ -455,25 +454,46 @@
     else:
         estimator.fit(X_train, y_train)
 
-    if hasattr(estimator, "evaluate"):
+    if isinstance(estimator, KerasGBatchClassifier):
+        scores = {}
         steps = estimator.prediction_steps
         batch_size = estimator.batch_size
-        generator = estimator.data_generator_.flow(
-            X_test, y=y_test, batch_size=batch_size
+        data_generator = estimator.data_generator_
+
+        scores, predictions, y_true = _evaluate_keras_and_sklearn_scores(
+            estimator,
+            data_generator,
+            X_test,
+            y=y_test,
+            sk_scoring=scoring,
+            steps=steps,
+            batch_size=batch_size,
+            return_predictions=bool(outfile_y_true),
         )
-        predictions, y_true = _predict_generator(
-            estimator.model_, generator, steps=steps
-        )
-        scores = _evaluate(y_true, predictions, scorer, is_multimetric=True)
 
     else:
+        scores = {}
+        if hasattr(estimator, "model_") and hasattr(estimator.model_, "metrics_names"):
+            batch_size = estimator.batch_size
+            score_results = estimator.model_.evaluate(
+                X_test, y=y_test, batch_size=batch_size, verbose=0
+            )
+            metrics_names = estimator.model_.metrics_names
+            if not isinstance(metrics_names, list):
+                scores[metrics_names] = score_results
+            else:
+                scores = dict(zip(metrics_names, score_results))
+
         if hasattr(estimator, "predict_proba"):
             predictions = estimator.predict_proba(X_test)
         else:
             predictions = estimator.predict(X_test)
 
         y_true = y_test
-        scores = _score(estimator, X_test, y_test, scorer, is_multimetric=True)
+        sk_scores = _score(estimator, X_test, y_test, scorer)
+        scores.update(sk_scores)
+
+    # handle output
     if outfile_y_true:
         try:
             pd.DataFrame(y_true).to_csv(outfile_y_true, sep="\t", index=False)
@@ -486,7 +506,6 @@
             )
         except Exception as e:
             print("Error in saving predictions: %s" % e)
-
     # handle output
     for name, score in scores.items():
         scores[name] = [score]
@@ -497,23 +516,7 @@
     memory.clear(warn=False)
 
     if outfile_object:
-        main_est = estimator
-        if isinstance(estimator, Pipeline):
-            main_est = estimator.steps[-1][-1]
-
-        if hasattr(main_est, "model_") and hasattr(main_est, "save_weights"):
-            if outfile_weights:
-                main_est.save_weights(outfile_weights)
-            del main_est.model_
-            del main_est.fit_params
-            del main_est.model_class_
-            if getattr(main_est, "validation_data", None):
-                del main_est.validation_data
-            if getattr(main_est, "data_generator_", None):
-                del main_est.data_generator_
-
-        with open(outfile_object, "wb") as output_handler:
-            pickle.dump(estimator, output_handler, pickle.HIGHEST_PROTOCOL)
+        dump_model_to_h5(estimator, outfile_object)
 
 
 if __name__ == "__main__":
@@ -524,7 +527,6 @@
     aparser.add_argument("-y", "--infile2", dest="infile2")
     aparser.add_argument("-O", "--outfile_result", dest="outfile_result")
     aparser.add_argument("-o", "--outfile_object", dest="outfile_object")
-    aparser.add_argument("-w", "--outfile_weights", dest="outfile_weights")
     aparser.add_argument("-l", "--outfile_y_true", dest="outfile_y_true")
     aparser.add_argument("-p", "--outfile_y_preds", dest="outfile_y_preds")
     aparser.add_argument("-g", "--groups", dest="groups")
@@ -541,7 +543,6 @@
         args.infile2,
         args.outfile_result,
         outfile_object=args.outfile_object,
-        outfile_weights=args.outfile_weights,
         outfile_y_true=args.outfile_y_true,
         outfile_y_preds=args.outfile_y_preds,
         groups=args.groups,
author	bgruening
date	Wed, 09 Aug 2023 13:33:25 +0000
parents	3312fb686ffb
children	e10ffa9e0b90