Previous changeset 35:e7f047a9dca9 (2021-04-13) Next changeset 37:e13a7c05b3a4 (2021-08-27) |
Commit message:
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit ea12f973df4b97a2691d9e4ce6bf6fae59d57717" |
modified:
fitted_model_eval.py keras_deep_learning.py keras_train_and_eval.py ml_visualization_ex.py model_prediction.py search_model_validation.py simple_model_fit.py stacking_ensembles.py to_categorical.py train_test_eval.py train_test_split.py |
added:
association_rules.py label_encoder.py test-data/le_input_w_header.tabular test-data/le_input_wo_header.tabular test-data/le_output.tabular test-data/mba_input_int_w.tabular test-data/mba_input_int_wo.tabular test-data/mba_input_str_w.tabular test-data/mba_input_str_wo.tabular test-data/mba_out_str.tabular test-data/mba_output_int.tabular test-data/mba_output_str.tabular |
b |
diff -r e7f047a9dca9 -r 73e7f1c76ece association_rules.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/association_rules.py Sat May 01 00:48:46 2021 +0000 |
[ |
import argparse
import json
import warnings

import pandas as pd
from mlxtend.frequent_patterns import association_rules, fpgrowth
from mlxtend.preprocessing import TransactionEncoder


def main(inputs, infile, outfile, min_support=0.5, min_confidence=0.5, min_lift=1.0, min_conviction=1.0, max_length=None):
    """
    Mine association rules from a tab-separated transaction file and
    write them, with their metrics, to a tab-separated output file.

    Each line of `infile` is one transaction; its tab-separated fields
    are the items of that transaction.

    Parameters
    ----------
    inputs : str
        File path to galaxy tool parameter (JSON). Only the `header0`
        key is read: when truthy, the first line of `infile` is skipped.

    infile : str
        File path of the input transactions

    outfile : str
        File path to output matrix

    min_support : float
        Minimum support, passed to `fpgrowth`

    min_confidence : float
        Minimum confidence, passed to `association_rules` as threshold

    min_lift : float
        Minimum lift; rules with a lower lift are dropped

    min_conviction : float
        Minimum conviction; rules with a lower conviction are dropped

    max_length : int or None
        Maximum itemset length, passed to `fpgrowth` as `max_len`

    """
    warnings.simplefilter('ignore')

    with open(inputs, 'r') as param_handler:
        params = json.load(param_handler)

    with open(infile) as fp:
        lines = fp.read().splitlines()

    # Drop the header row when the tool parameters say there is one
    if params['header0']:
        lines = lines[1:]

    # One transaction per line, one item per tab-separated field
    dataset = [line.split("\t") for line in lines]

    # TransactionEncoder learns the unique labels in the dataset and transforms the
    # input dataset (a Python list of lists) into a one-hot encoded NumPy boolean array
    te = TransactionEncoder()
    te_ary = te.fit_transform(dataset)

    # Turn the encoded NumPy array into a DataFrame
    df = pd.DataFrame(te_ary, columns=te.columns_)

    # Extract frequent itemsets for association rule mining
    # use_colnames: Use DataFrames' column names in the returned DataFrame instead of column indices
    frequent_itemsets = fpgrowth(df, min_support=min_support, use_colnames=True, max_len=max_length)

    # Get association rules, with confidence of at least min_confidence
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)

    # Filter association rules, keeping rules whose lift and conviction are
    # at least min_lift and min_conviction. Take an explicit copy so the
    # column assignments below modify an independent frame rather than a
    # slice of the unfiltered result.
    keep = (rules['lift'] >= min_lift) & (rules['conviction'] >= min_conviction)
    rules = rules[keep].copy()

    # Convert columns from frozenset to sorted list: more readable, and the
    # item order inside every cell becomes deterministic.
    rules['antecedents'] = rules['antecedents'].apply(sorted)
    rules['consequents'] = rules['consequents'].apply(sorted)

    # Fix the order of the generated rules so tests that rely on diff'ing
    # the produced output against an expected output can pass: sort on two
    # temporary string columns, then drop them again.
    rules['ant_str'] = rules['antecedents'].apply(lambda row: " ".join(row))
    rules['con_str'] = rules['consequents'].apply(lambda row: " ".join(row))
    rules = rules.sort_values(by=['ant_str', 'con_str'])
    rules = rules.drop(columns=['ant_str', 'con_str'])
    rules = rules.reset_index(drop=True)

    # Write association rules and metrics to file
    rules.to_csv(outfile, sep="\t", index=False)


if __name__ == '__main__':
    aparser = argparse.ArgumentParser()
    aparser.add_argument("-i", "--inputs", dest="inputs", required=True)
    aparser.add_argument("-y", "--infile", dest="infile", required=True)
    aparser.add_argument("-o", "--outfile", dest="outfile", required=True)
    # Let argparse do the numeric conversion so an invalid value is reported
    # as a usage error instead of a traceback from float()/int().
    aparser.add_argument("-s", "--support", dest="support", type=float, default=0.5)
    aparser.add_argument("-c", "--confidence", dest="confidence", type=float, default=0.5)
    aparser.add_argument("-l", "--lift", dest="lift", type=float, default=1.0)
    aparser.add_argument("-v", "--conviction", dest="conviction", type=float, default=1.0)
    # NOTE: the command-line default (5) differs from main()'s default (None)
    aparser.add_argument("-t", "--length", dest="length", type=int, default=5)
    args = aparser.parse_args()

    main(args.inputs, args.infile, args.outfile,
         min_support=args.support, min_confidence=args.confidence,
         min_lift=args.lift, min_conviction=args.conviction, max_length=args.length)
b |
diff -r e7f047a9dca9 -r 73e7f1c76ece fitted_model_eval.py --- a/fitted_model_eval.py Tue Apr 13 22:08:10 2021 +0000 +++ b/fitted_model_eval.py Sat May 01 00:48:46 2021 +0000 |
[ |
@@ -30,7 +30,9 @@ # tabular input if input_type == "tabular": header = "infer" if params["input_options"]["header1"] else None - column_option = params["input_options"]["column_selector_options_1"]["selected_column_selector_option"] + column_option = params["input_options"]["column_selector_options_1"][ + "selected_column_selector_option" + ] if column_option in [ "by_index_number", "all_but_by_index_number", @@ -52,7 +54,9 @@ # Get target y header = "infer" if params["input_options"]["header2"] else None - column_option = params["input_options"]["column_selector_options_2"]["selected_column_selector_option2"] + column_option = params["input_options"]["column_selector_options_2"][ + "selected_column_selector_option2" + ] if column_option in [ "by_index_number", "all_but_by_index_number", @@ -70,7 +74,9 @@ infile2 = pd.read_csv(infile2, sep="\t", header=header, parse_dates=True) loaded_df[df_key] = infile2 - y = read_columns(infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True) + y = read_columns( + infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True + ) if len(y.shape) == 2 and y.shape[1] == 1: y = y.ravel() @@ -123,7 +129,8 @@ if hasattr(main_est, "config") and hasattr(main_est, "load_weights"): if not infile_weights or infile_weights == "None": raise ValueError( - "The selected model skeleton asks for weights, " "but no dataset for weights was provided!" + "The selected model skeleton asks for weights, " + "but no dataset for weights was provided!" ) main_est.load_weights(infile_weights) @@ -142,7 +149,9 @@ scorer, _ = _check_multimetric_scoring(estimator, scoring=scorer) if hasattr(estimator, "evaluate"): - scores = estimator.evaluate(X_test, y_test=y_test, scorer=scorer, is_multimetric=True) + scores = estimator.evaluate( + X_test, y_test=y_test, scorer=scorer, is_multimetric=True + ) else: scores = _score(estimator, X_test, y_test, scorer, is_multimetric=True) |
b |
diff -r e7f047a9dca9 -r 73e7f1c76ece keras_deep_learning.py --- a/keras_deep_learning.py Tue Apr 13 22:08:10 2021 +0000 +++ b/keras_deep_learning.py Sat May 01 00:48:46 2021 +0000 |
[ |
b'@@ -10,12 +10,12 @@\n from galaxy_ml.utils import get_search_params, SafeEval, try_get_attr\n from keras.models import Model, Sequential\n \n-\n safe_eval = SafeEval()\n \n \n def _handle_shape(literal):\n- """Eval integer or list/tuple of integers from string\n+ """\n+ Eval integer or list/tuple of integers from string\n \n Parameters:\n -----------\n@@ -32,7 +32,8 @@\n \n \n def _handle_regularizer(literal):\n- """Construct regularizer from string literal\n+ """\n+ Construct regularizer from string literal\n \n Parameters\n ----------\n@@ -48,15 +49,16 @@\n return None\n \n if l1 is None:\n- l1 = 0.\n+ l1 = 0.0\n if l2 is None:\n- l2 = 0.\n+ l2 = 0.0\n \n return keras.regularizers.l1_l2(l1=l1, l2=l2)\n \n \n def _handle_constraint(config):\n- """Construct constraint from galaxy tool parameters.\n+ """\n+ Construct constraint from galaxy tool parameters.\n Suppose correct dictionary format\n \n Parameters\n@@ -72,14 +74,14 @@\n "MinMaxNorm"\n }\n """\n- constraint_type = config[\'constraint_type\']\n- if constraint_type in (\'None\', \'\'):\n+ constraint_type = config["constraint_type"]\n+ if constraint_type in ("None", ""):\n return None\n \n klass = getattr(keras.constraints, constraint_type)\n- options = config.get(\'constraint_options\', {})\n- if \'axis\' in options:\n- options[\'axis\'] = literal_eval(options[\'axis\'])\n+ options = config.get("constraint_options", {})\n+ if "axis" in options:\n+ options["axis"] = literal_eval(options["axis"])\n \n return klass(**options)\n \n@@ -89,62 +91,82 @@\n \n \n def _handle_layer_parameters(params):\n- """Access to handle all kinds of parameters\n+ """\n+ Access to handle all kinds of parameters\n """\n for key, value in six.iteritems(params):\n- if value in (\'None\', \'\'):\n+ if value in ("None", ""):\n params[key] = None\n continue\n \n- if type(value) in [int, float, bool]\\\n- or (type(value) is str and value.isalpha()):\n+ if type(value) in [int, float, bool] or (\n+ type(value) is str and value.isalpha()\n+ 
):\n continue\n \n- if key in [\'input_shape\', \'noise_shape\', \'shape\', \'batch_shape\',\n- \'target_shape\', \'dims\', \'kernel_size\', \'strides\',\n- \'dilation_rate\', \'output_padding\', \'cropping\', \'size\',\n- \'padding\', \'pool_size\', \'axis\', \'shared_axes\'] \\\n- and isinstance(value, str):\n+ if (\n+ key\n+ in [\n+ "input_shape",\n+ "noise_shape",\n+ "shape",\n+ "batch_shape",\n+ "target_shape",\n+ "dims",\n+ "kernel_size",\n+ "strides",\n+ "dilation_rate",\n+ "output_padding",\n+ "cropping",\n+ "size",\n+ "padding",\n+ "pool_size",\n+ "axis",\n+ "shared_axes",\n+ ]\n+ and isinstance(value, str)\n+ ):\n params[key] = _handle_shape(value)\n \n- elif key.endswith(\'_regularizer\') and isinstance(value, dict):\n+ elif key.endswith("_regularizer") and isinstance(value, dict):\n params[key] = _handle_regularizer(value)\n \n- elif key.endswith(\'_constraint\') and isinstance(value, dict):\n+ elif key.endswith("_constraint") and isinstance(value, dict):\n params[key] = _handle_constraint(value)\n \n- elif key == \'function\': # No support for lambda/function eval\n+ elif key == "function": # No support for lambda/function eval\n params.pop(key)\n \n return params\n \n \n def get_sequential_model(config):\n- """Construct keras Sequential model from Galaxy tool parameters\n+ """\n+ Construct keras Sequential model from Galaxy tool parameters\n \n Parameters:\n -----------\n '..b'mizer_selection"][\n+ "optimizer_options"\n+ ]\n+ )\n+ )\n \n- train_metrics = inputs[\'mode_selection\'][\'compile_params\'][\'metrics\']\n- if train_metrics[-1] == \'none\':\n+ train_metrics = inputs["mode_selection"]["compile_params"]["metrics"]\n+ if train_metrics[-1] == "none":\n train_metrics = train_metrics[:-1]\n- options[\'metrics\'] = train_metrics\n+ options["metrics"] = train_metrics\n \n- options.update(inputs[\'mode_selection\'][\'fit_params\'])\n- options[\'seed\'] = inputs[\'mode_selection\'][\'random_seed\']\n+ 
options.update(inputs["mode_selection"]["fit_params"])\n+ options["seed"] = inputs["mode_selection"]["random_seed"]\n \n if batch_mode:\n- generator = get_batch_generator(inputs[\'mode_selection\']\n- [\'generator_selection\'])\n- options[\'data_batch_generator\'] = generator\n- options[\'prediction_steps\'] = \\\n- inputs[\'mode_selection\'][\'prediction_steps\']\n- options[\'class_positive_factor\'] = \\\n- inputs[\'mode_selection\'][\'class_positive_factor\']\n+ generator = get_batch_generator(\n+ inputs["mode_selection"]["generator_selection"]\n+ )\n+ options["data_batch_generator"] = generator\n+ options["prediction_steps"] = inputs["mode_selection"]["prediction_steps"]\n+ options["class_positive_factor"] = inputs["mode_selection"][\n+ "class_positive_factor"\n+ ]\n estimator = klass(config, **options)\n if outfile_params:\n hyper_params = get_search_params(estimator)\n # TODO: remove this after making `verbose` tunable\n for h_param in hyper_params:\n- if h_param[1].endswith(\'verbose\'):\n- h_param[0] = \'@\'\n- df = pd.DataFrame(hyper_params, columns=[\'\', \'Parameter\', \'Value\'])\n- df.to_csv(outfile_params, sep=\'\\t\', index=False)\n+ if h_param[1].endswith("verbose"):\n+ h_param[0] = "@"\n+ df = pd.DataFrame(hyper_params, columns=["", "Parameter", "Value"])\n+ df.to_csv(outfile_params, sep="\\t", index=False)\n \n print(repr(estimator))\n # save model by pickle\n- with open(outfile, \'wb\') as f:\n+ with open(outfile, "wb") as f:\n pickle.dump(estimator, f, pickle.HIGHEST_PROTOCOL)\n \n \n-if __name__ == \'__main__\':\n- warnings.simplefilter(\'ignore\')\n+if __name__ == "__main__":\n+ warnings.simplefilter("ignore")\n \n aparser = argparse.ArgumentParser()\n aparser.add_argument("-i", "--inputs", dest="inputs", required=True)\n@@ -345,7 +383,7 @@\n args = aparser.parse_args()\n \n input_json_path = args.inputs\n- with open(input_json_path, \'r\') as param_handler:\n+ with open(input_json_path, "r") as param_handler:\n inputs = 
json.load(param_handler)\n \n tool_id = args.tool_id\n@@ -355,18 +393,20 @@\n infile_weights = args.infile_weights\n \n # for keras_model_config tool\n- if tool_id == \'keras_model_config\':\n+ if tool_id == "keras_model_config":\n config_keras_model(inputs, outfile)\n \n # for keras_model_builder tool\n else:\n batch_mode = False\n- if tool_id == \'keras_batch_models\':\n+ if tool_id == "keras_batch_models":\n batch_mode = True\n \n- build_keras_model(inputs=inputs,\n- model_json=model_json,\n- infile_weights=infile_weights,\n- batch_mode=batch_mode,\n- outfile=outfile,\n- outfile_params=outfile_params)\n+ build_keras_model(\n+ inputs=inputs,\n+ model_json=model_json,\n+ infile_weights=infile_weights,\n+ batch_mode=batch_mode,\n+ outfile=outfile,\n+ outfile_params=outfile_params,\n+ )\n' |
b |
diff -r e7f047a9dca9 -r 73e7f1c76ece keras_train_and_eval.py --- a/keras_train_and_eval.py Tue Apr 13 22:08:10 2021 +0000 +++ b/keras_train_and_eval.py Sat May 01 00:48:46 2021 +0000 |
[ |
b'@@ -11,16 +11,9 @@\n from galaxy_ml.externals.selene_sdk.utils import compute_score\n from galaxy_ml.keras_galaxy_models import _predict_generator\n from galaxy_ml.model_validations import train_test_split\n-from galaxy_ml.utils import (\n- clean_params,\n- get_main_estimator,\n- get_module,\n- get_scoring,\n- load_model,\n- read_columns,\n- SafeEval,\n- try_get_attr,\n-)\n+from galaxy_ml.utils import (clean_params, get_main_estimator,\n+ get_module, get_scoring, load_model, read_columns,\n+ SafeEval, try_get_attr)\n from scipy.io import mmread\n from sklearn.metrics.scorer import _check_multimetric_scoring\n from sklearn.model_selection import _search, _validation\n@@ -28,7 +21,6 @@\n from sklearn.pipeline import Pipeline\n from sklearn.utils import indexable, safe_indexing\n \n-\n _fit_and_score = try_get_attr("galaxy_ml.model_validations", "_fit_and_score")\n setattr(_search, "_fit_and_score", _fit_and_score)\n setattr(_validation, "_fit_and_score", _fit_and_score)\n@@ -56,7 +48,10 @@\n \n param_name = p["sp_name"]\n if param_name.lower().endswith(NON_SEARCHABLE):\n- warnings.warn("Warning: `%s` is not eligible for search and was " "omitted!" % param_name)\n+ warnings.warn(\n+ "Warning: `%s` is not eligible for search and was "\n+ "omitted!" 
% param_name\n+ )\n continue\n \n if not swap_value.startswith(":"):\n@@ -99,7 +94,11 @@\n index_arr = np.arange(n_samples)\n test = index_arr[np.isin(groups, group_names)]\n train = index_arr[~np.isin(groups, group_names)]\n- rval = list(chain.from_iterable((safe_indexing(a, train), safe_indexing(a, test)) for a in new_arrays))\n+ rval = list(\n+ chain.from_iterable(\n+ (safe_indexing(a, train), safe_indexing(a, test)) for a in new_arrays\n+ )\n+ )\n else:\n rval = train_test_split(*new_arrays, **kwargs)\n \n@@ -127,14 +126,22 @@\n pred_labels = (pred_probas > 0.5).astype("int32")\n targets = y_true.ravel().astype("int32")\n if not is_multimetric:\n- preds = pred_labels if scorer.__class__.__name__ == "_PredictScorer" else pred_probas\n+ preds = (\n+ pred_labels\n+ if scorer.__class__.__name__ == "_PredictScorer"\n+ else pred_probas\n+ )\n score = scorer._score_func(targets, preds, **scorer._kwargs)\n \n return score\n else:\n scores = {}\n for name, one_scorer in scorer.items():\n- preds = pred_labels if one_scorer.__class__.__name__ == "_PredictScorer" else pred_probas\n+ preds = (\n+ pred_labels\n+ if one_scorer.__class__.__name__ == "_PredictScorer"\n+ else pred_probas\n+ )\n score = one_scorer._score_func(targets, preds, **one_scorer._kwargs)\n scores[name] = score\n \n@@ -144,13 +151,21 @@\n pred_labels = (pred_probas > 0.5).astype("int32")\n targets = y_true.astype("int32")\n if not is_multimetric:\n- preds = pred_labels if scorer.__class__.__name__ == "_PredictScorer" else pred_probas\n+ preds = (\n+ pred_labels\n+ if scorer.__class__.__name__ == "_PredictScorer"\n+ else pred_probas\n+ )\n score, _ = compute_score(preds, targets, scorer._score_func)\n return score\n else:\n scores = {}\n for name, one_scorer in scorer.items():\n- preds = pred_labels if one_scorer.__class__.__name__ == "_PredictScorer" else pred_probas\n+ preds = (\n+ pred_labels\n+ if one_scorer.__class__.__name__ == "_PredictScorer"\n+ else pred_probas\n+ )\n score, _ = 
compute_sco'..b'put_options"]["column_selector_options_2"][\n+ "selected_column_selector_option2"\n+ ]\n if column_option in [\n "by_index_number",\n "all_but_by_index_number",\n@@ -313,12 +332,9 @@\n infile2 = pd.read_csv(infile2, sep="\\t", header=header, parse_dates=True)\n loaded_df[df_key] = infile2\n \n- y = read_columns(infile2,\n- c=c,\n- c_option=column_option,\n- sep=\'\\t\',\n- header=header,\n- parse_dates=True)\n+ y = read_columns(\n+ infile2, c=c, c_option=column_option, sep="\\t", header=header, parse_dates=True\n+ )\n if len(y.shape) == 2 and y.shape[1] == 1:\n y = y.ravel()\n if input_type == "refseq_and_interval":\n@@ -328,10 +344,14 @@\n \n # load groups\n if groups:\n- groups_selector = (params["experiment_schemes"]["test_split"]["split_algos"]).pop("groups_selector")\n+ groups_selector = (\n+ params["experiment_schemes"]["test_split"]["split_algos"]\n+ ).pop("groups_selector")\n \n header = "infer" if groups_selector["header_g"] else None\n- column_option = groups_selector["column_selector_options_g"]["selected_column_selector_option_g"]\n+ column_option = groups_selector["column_selector_options_g"][\n+ "selected_column_selector_option_g"\n+ ]\n if column_option in [\n "by_index_number",\n "all_but_by_index_number",\n@@ -346,12 +366,14 @@\n if df_key in loaded_df:\n groups = loaded_df[df_key]\n \n- groups = read_columns(groups,\n- c=c,\n- c_option=column_option,\n- sep=\'\\t\',\n- header=header,\n- parse_dates=True)\n+ groups = read_columns(\n+ groups,\n+ c=c,\n+ c_option=column_option,\n+ sep="\\t",\n+ header=header,\n+ parse_dates=True,\n+ )\n groups = groups.ravel()\n \n # del loaded_df\n@@ -364,7 +386,7 @@\n main_est.set_params(memory=memory)\n \n # handle scorer, convert to scorer dict\n- scoring = params[\'experiment_schemes\'][\'metrics\'][\'scoring\']\n+ scoring = params["experiment_schemes"]["metrics"]["scoring"]\n if scoring is not None:\n # get_scoring() expects secondary_scoring to be a comma separated string (not a list)\n # Check 
if secondary_scoring is specified\n@@ -385,7 +407,9 @@\n if y is not None:\n test_split_options["labels"] = y\n else:\n- raise ValueError("Stratified shuffle split is not " "applicable on empty target values!")\n+ raise ValueError(\n+ "Stratified shuffle split is not " "applicable on empty target values!"\n+ )\n \n (\n X_train,\n@@ -408,7 +432,10 @@\n if y_train is not None:\n val_split_options["labels"] = y_train\n else:\n- raise ValueError("Stratified shuffle split is not " "applicable on empty target values!")\n+ raise ValueError(\n+ "Stratified shuffle split is not "\n+ "applicable on empty target values!"\n+ )\n \n (\n X_train,\n@@ -431,8 +458,12 @@\n if hasattr(estimator, "evaluate"):\n steps = estimator.prediction_steps\n batch_size = estimator.batch_size\n- generator = estimator.data_generator_.flow(X_test, y=y_test, batch_size=batch_size)\n- predictions, y_true = _predict_generator(estimator.model_, generator, steps=steps)\n+ generator = estimator.data_generator_.flow(\n+ X_test, y=y_test, batch_size=batch_size\n+ )\n+ predictions, y_true = _predict_generator(\n+ estimator.model_, generator, steps=steps\n+ )\n scores = _evaluate(y_true, predictions, scorer, is_multimetric=True)\n \n else:\n' |
b |
diff -r e7f047a9dca9 -r 73e7f1c76ece label_encoder.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/label_encoder.py Sat May 01 00:48:46 2021 +0000 |
[ |
import argparse
import json
import warnings

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder


def main(inputs, infile, outfile):
    """
    Encode the labels of a single-column tabular file as integers and
    write one encoded value per line.

    Parameters
    ----------
    inputs : str
        File path to galaxy tool parameter (JSON). Only the `header0`
        key is read: when truthy, the first row of `infile` is treated
        as a header.

    infile : str
        File path of input vector

    outfile : str
        File path to output vector

    Raises
    ------
    ValueError
        If `infile` does not contain exactly one column.

    """
    warnings.simplefilter('ignore')

    with open(inputs, 'r') as param_handler:
        params = json.load(param_handler)

    header = 'infer' if params['header0'] else None

    input_table = pd.read_csv(infile, sep='\t', header=header)

    # LabelEncoder works on a 1-D vector. Select the single column
    # explicitly instead of relying on an implicit conversion whose
    # warning is silenced above.
    if input_table.shape[1] != 1:
        raise ValueError(
            "Expected an input vector with exactly one column, got %d columns."
            % input_table.shape[1]
        )
    input_vector = input_table.iloc[:, 0]

    le = LabelEncoder()

    output_vector = le.fit_transform(input_vector)

    np.savetxt(outfile, output_vector, fmt="%d", delimiter='\t')


if __name__ == '__main__':
    aparser = argparse.ArgumentParser()
    aparser.add_argument("-i", "--inputs", dest="inputs", required=True)
    # Both paths are mandatory: main() cannot run without them, so let
    # argparse report a missing one up front.
    aparser.add_argument("-y", "--infile", dest="infile", required=True)
    aparser.add_argument("-o", "--outfile", dest="outfile", required=True)
    args = aparser.parse_args()

    main(args.inputs, args.infile, args.outfile)
b |
diff -r e7f047a9dca9 -r 73e7f1c76ece ml_visualization_ex.py --- a/ml_visualization_ex.py Tue Apr 13 22:08:10 2021 +0000 +++ b/ml_visualization_ex.py Sat May 01 00:48:46 2021 +0000 |
[ |
@@ -13,10 +13,10 @@ from keras.models import model_from_json from keras.utils import plot_model from sklearn.feature_selection.base import SelectorMixin -from sklearn.metrics import auc, average_precision_score, confusion_matrix, precision_recall_curve, roc_curve +from sklearn.metrics import (auc, average_precision_score, confusion_matrix, + precision_recall_curve, roc_curve) from sklearn.pipeline import Pipeline - safe_eval = SafeEval() # plotly default colors @@ -51,7 +51,9 @@ y_true = df1.iloc[:, idx].values y_score = df2.iloc[:, idx].values - precision, recall, _ = precision_recall_curve(y_true, y_score, pos_label=pos_label) + precision, recall, _ = precision_recall_curve( + y_true, y_score, pos_label=pos_label + ) ap = average_precision_score(y_true, y_score, pos_label=pos_label or 1) trace = go.Scatter( @@ -111,7 +113,9 @@ y_true = df1.iloc[:, idx].values y_score = df2.iloc[:, idx].values - precision, recall, _ = precision_recall_curve(y_true, y_score, pos_label=pos_label) + precision, recall, _ = precision_recall_curve( + y_true, y_score, pos_label=pos_label + ) ap = average_precision_score(y_true, y_score, pos_label=pos_label or 1) plt.step( @@ -155,7 +159,9 @@ y_true = df1.iloc[:, idx].values y_score = df2.iloc[:, idx].values - fpr, tpr, _ = roc_curve(y_true, y_score, pos_label=pos_label, drop_intermediate=drop_intermediate) + fpr, tpr, _ = roc_curve( + y_true, y_score, pos_label=pos_label, drop_intermediate=drop_intermediate + ) roc_auc = auc(fpr, tpr) trace = go.Scatter( @@ -168,7 +174,9 @@ data.append(trace) layout = go.Layout( - xaxis=dict(title="False Positive Rate", linecolor="lightslategray", linewidth=1), + xaxis=dict( + title="False Positive Rate", linecolor="lightslategray", linewidth=1 + ), yaxis=dict(title="True Positive Rate", linecolor="lightslategray", linewidth=1), title=dict( text=title or "Receiver Operating Characteristic (ROC) Curve", @@ -204,7 +212,9 @@ os.rename("output.html", "output") -def visualize_roc_curve_matplotlib(df1, df2, 
pos_label, drop_intermediate=True, title=None): +def visualize_roc_curve_matplotlib( + df1, df2, pos_label, drop_intermediate=True, title=None +): """visualize roc-curve using matplotlib and output svg image""" backend = matplotlib.get_backend() if "inline" not in backend: @@ -216,7 +226,9 @@ y_true = df1.iloc[:, idx].values y_score = df2.iloc[:, idx].values - fpr, tpr, _ = roc_curve(y_true, y_score, pos_label=pos_label, drop_intermediate=drop_intermediate) + fpr, tpr, _ = roc_curve( + y_true, y_score, pos_label=pos_label, drop_intermediate=drop_intermediate + ) roc_auc = auc(fpr, tpr) plt.step( @@ -253,11 +265,15 @@ col = plot_selection[column_name]["col1"] else: col = None - _, input_df = read_columns(file_path, c=col, - c_option=column_option, - return_df=True, - sep='\t', header=header, - parse_dates=True) + _, input_df = read_columns( + file_path, + c=col, + c_option=column_option, + return_df=True, + sep="\t", + header=header, + parse_dates=True, + ) return input_df @@ -344,7 +360,9 @@ with open(infile_estimator, "rb") as estimator_handler: estimator = load_model(estimator_handler) - column_option = params["plotting_selection"]["column_selector_options"]["selected_column_selector_option"] + column_option = params["plotting_selection"]["column_selector_options"][ + "selected_column_selector_option" + ] if column_option in [ "by_index_number", "all_but_by_index_number", @@ -379,7 +397,11 @@ else: coefs = getattr(estimator, "feature_importances_", None) if coefs is None: - raise RuntimeError("The classifier does not expose " '"coef_" or "feature_importances_" ' "attributes") + raise RuntimeError( + "The classifier does not expose " + '"coef_" or "feature_importances_" ' + "attributes" + ) threshold = params["plotting_selection"]["threshold"] if threshold is not None: @@ -454,7 +476,9 @@ layout = go.Layout( xaxis=dict(title="Number of features selected"), yaxis=dict(title="Cross validation score"), - title=dict(text=title or None, x=0.5, y=0.92, xanchor="center", 
yanchor="top"), + title=dict( + text=title or None, x=0.5, y=0.92, xanchor="center", yanchor="top" + ), font=dict(family="sans-serif", size=11), # control backgroud colors plot_bgcolor="rgba(255,255,255,0)", @@ -548,9 +572,13 @@ elif plot_type == "classification_confusion_matrix": plot_selection = params["plotting_selection"] - input_true = get_dataframe(true_labels, plot_selection, "header_true", "column_selector_options_true") + input_true = get_dataframe( + true_labels, plot_selection, "header_true", "column_selector_options_true" + ) header_predicted = "infer" if plot_selection["header_predicted"] else None - input_predicted = pd.read_csv(predicted_labels, sep="\t", parse_dates=True, header=header_predicted) + input_predicted = pd.read_csv( + predicted_labels, sep="\t", parse_dates=True, header=header_predicted + ) true_classes = input_true.iloc[:, -1].copy() predicted_classes = input_predicted.iloc[:, -1].copy() axis_labels = list(set(true_classes)) |
b |
diff -r e7f047a9dca9 -r 73e7f1c76ece model_prediction.py --- a/model_prediction.py Tue Apr 13 22:08:10 2021 +0000 +++ b/model_prediction.py Sat May 01 00:48:46 2021 +0000 |
[ |
@@ -63,7 +63,8 @@ if hasattr(main_est, "config") and hasattr(main_est, "load_weights"): if not infile_weights or infile_weights == "None": raise ValueError( - "The selected model skeleton asks for weights, " "but dataset for weights wan not selected!" + "The selected model skeleton asks for weights, " + "but dataset for weights wan not selected!" ) main_est.load_weights(infile_weights) @@ -72,7 +73,9 @@ # tabular input if input_type == "tabular": header = "infer" if params["input_options"]["header1"] else None - column_option = params["input_options"]["column_selector_options_1"]["selected_column_selector_option"] + column_option = params["input_options"]["column_selector_options_1"][ + "selected_column_selector_option" + ] if column_option in [ "by_index_number", "all_but_by_index_number", @@ -122,9 +125,13 @@ pred_data_generator = klass(fasta_path, seq_length=seq_length) if params["method"] == "predict": - preds = estimator.predict(X, data_generator=pred_data_generator, steps=steps) + preds = estimator.predict( + X, data_generator=pred_data_generator, steps=steps + ) else: - preds = estimator.predict_proba(X, data_generator=pred_data_generator, steps=steps) + preds = estimator.predict_proba( + X, data_generator=pred_data_generator, steps=steps + ) # vcf input elif input_type == "variant_effect": @@ -135,7 +142,9 @@ if options["blacklist_regions"] == "none": options["blacklist_regions"] = None - pred_data_generator = klass(ref_genome_path=ref_seq, vcf_path=vcf_path, **options) + pred_data_generator = klass( + ref_genome_path=ref_seq, vcf_path=vcf_path, **options + ) pred_data_generator.set_processing_attrs() |
b |
diff -r e7f047a9dca9 -r 73e7f1c76ece search_model_validation.py --- a/search_model_validation.py Tue Apr 13 22:08:10 2021 +0000 +++ b/search_model_validation.py Sat May 01 00:48:46 2021 +0000 |
[ |
b'@@ -11,31 +11,16 @@\n import numpy as np\n import pandas as pd\n import skrebate\n-from galaxy_ml.utils import (\n- clean_params,\n- get_cv,\n- get_main_estimator,\n- get_module,\n- get_scoring,\n- load_model,\n- read_columns,\n- SafeEval,\n- try_get_attr\n-)\n+from galaxy_ml.utils import (clean_params, get_cv,\n+ get_main_estimator, get_module, get_scoring,\n+ load_model, read_columns, SafeEval, try_get_attr)\n from scipy.io import mmread\n-from sklearn import (\n- cluster,\n- decomposition,\n- feature_selection,\n- kernel_approximation,\n- model_selection,\n- preprocessing,\n-)\n+from sklearn import (cluster, decomposition, feature_selection,\n+ kernel_approximation, model_selection, preprocessing)\n from sklearn.exceptions import FitFailedWarning\n from sklearn.model_selection import _search, _validation\n from sklearn.model_selection._validation import _score, cross_validate\n \n-\n _fit_and_score = try_get_attr("galaxy_ml.model_validations", "_fit_and_score")\n setattr(_search, "_fit_and_score", _fit_and_score)\n setattr(_validation, "_fit_and_score", _fit_and_score)\n@@ -57,7 +42,10 @@\n \n param_name = p["sp_name"]\n if param_name.lower().endswith(NON_SEARCHABLE):\n- print("Warning: `%s` is not eligible for search and was " "omitted!" % param_name)\n+ print(\n+ "Warning: `%s` is not eligible for search and was "\n+ "omitted!" 
% param_name\n+ )\n continue\n \n if not search_list.startswith(":"):\n@@ -90,7 +78,9 @@\n decomposition.IncrementalPCA(),\n decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),\n decomposition.LatentDirichletAllocation(random_state=0, n_jobs=N_JOBS),\n- decomposition.MiniBatchDictionaryLearning(random_state=0, n_jobs=N_JOBS),\n+ decomposition.MiniBatchDictionaryLearning(\n+ random_state=0, n_jobs=N_JOBS\n+ ),\n decomposition.MiniBatchSparsePCA(random_state=0, n_jobs=N_JOBS),\n decomposition.NMF(random_state=0),\n decomposition.PCA(random_state=0),\n@@ -107,14 +97,26 @@\n skrebate.MultiSURF(n_jobs=N_JOBS),\n skrebate.MultiSURFstar(n_jobs=N_JOBS),\n imblearn.under_sampling.ClusterCentroids(random_state=0, n_jobs=N_JOBS),\n- imblearn.under_sampling.CondensedNearestNeighbour(random_state=0, n_jobs=N_JOBS),\n- imblearn.under_sampling.EditedNearestNeighbours(random_state=0, n_jobs=N_JOBS),\n- imblearn.under_sampling.RepeatedEditedNearestNeighbours(random_state=0, n_jobs=N_JOBS),\n+ imblearn.under_sampling.CondensedNearestNeighbour(\n+ random_state=0, n_jobs=N_JOBS\n+ ),\n+ imblearn.under_sampling.EditedNearestNeighbours(\n+ random_state=0, n_jobs=N_JOBS\n+ ),\n+ imblearn.under_sampling.RepeatedEditedNearestNeighbours(\n+ random_state=0, n_jobs=N_JOBS\n+ ),\n imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),\n- imblearn.under_sampling.InstanceHardnessThreshold(random_state=0, n_jobs=N_JOBS),\n+ imblearn.under_sampling.InstanceHardnessThreshold(\n+ random_state=0, n_jobs=N_JOBS\n+ ),\n imblearn.under_sampling.NearMiss(random_state=0, n_jobs=N_JOBS),\n- imblearn.under_sampling.NeighbourhoodCleaningRule(random_state=0, n_jobs=N_JOBS),\n- imblearn.under_sampling.OneSidedSelection(random_state=0, n_jobs=N_JOBS),\n+ imblearn.under_sampling.NeighbourhoodCleaningRule(\n+ random_state=0, n_jobs=N_JOBS\n+ ),\n+ imblearn.under_sampling.OneSidedSelection(\n+ random_state=0'..b' split_options["shuffle"] = None\n@@ -411,9 +425,13 @@\n \n # TODO Solve deep learning 
models in pipeline\n if best_estimator_.__class__.__name__ == "KerasGBatchClassifier":\n- test_score = best_estimator_.evaluate(X_test, scorer=scorer_, is_multimetric=is_multimetric)\n+ test_score = best_estimator_.evaluate(\n+ X_test, scorer=scorer_, is_multimetric=is_multimetric\n+ )\n else:\n- test_score = _score(best_estimator_, X_test, y_test, scorer_, is_multimetric=is_multimetric)\n+ test_score = _score(\n+ best_estimator_, X_test, y_test, scorer_, is_multimetric=is_multimetric\n+ )\n \n if not is_multimetric:\n test_score = {primary_scoring: test_score}\n@@ -487,7 +505,9 @@\n params = json.load(param_handler)\n \n # Override the refit parameter\n- params["search_schemes"]["options"]["refit"] = True if params["save"] != "nope" else False\n+ params["search_schemes"]["options"]["refit"] = (\n+ True if params["save"] != "nope" else False\n+ )\n \n with open(infile_estimator, "rb") as estimator_handler:\n estimator = load_model(estimator_handler)\n@@ -499,17 +519,21 @@\n options = params["search_schemes"]["options"]\n \n if groups:\n- header = "infer" if (options["cv_selector"]["groups_selector"]["header_g"]) else None\n- column_option = options["cv_selector"]["groups_selector"]["column_selector_options_g"][\n- "selected_column_selector_option_g"\n- ]\n+ header = (\n+ "infer" if (options["cv_selector"]["groups_selector"]["header_g"]) else None\n+ )\n+ column_option = options["cv_selector"]["groups_selector"][\n+ "column_selector_options_g"\n+ ]["selected_column_selector_option_g"]\n if column_option in [\n "by_index_number",\n "all_but_by_index_number",\n "by_header_name",\n "all_but_by_header_name",\n ]:\n- c = options["cv_selector"]["groups_selector"]["column_selector_options_g"]["col_g"]\n+ c = options["cv_selector"]["groups_selector"]["column_selector_options_g"][\n+ "col_g"\n+ ]\n else:\n c = None\n \n@@ -537,12 +561,14 @@\n secondary_scoring = options["scoring"].get("secondary_scoring", None)\n if secondary_scoring is not None:\n # If secondary_scoring is 
specified, convert the list into comman separated string\n- options["scoring"]["secondary_scoring"] = ",".join(options["scoring"]["secondary_scoring"])\n+ options["scoring"]["secondary_scoring"] = ",".join(\n+ options["scoring"]["secondary_scoring"]\n+ )\n options["scoring"] = get_scoring(options["scoring"])\n if options["error_score"]:\n options["error_score"] = "raise"\n else:\n- options["error_score"] = np.NaN\n+ options["error_score"] = np.nan\n if options["refit"] and isinstance(options["scoring"], dict):\n options["refit"] = primary_scoring\n if "pre_dispatch" in options and options["pre_dispatch"] == "":\n@@ -588,7 +614,9 @@\n # make sure refit is choosen\n # this could be True for sklearn models, but not the case for\n # deep learning models\n- if not options["refit"] and not all(hasattr(estimator, attr) for attr in ("config", "model_type")):\n+ if not options["refit"] and not all(\n+ hasattr(estimator, attr) for attr in ("config", "model_type")\n+ ):\n warnings.warn("Refit is change to `True` for nested validation!")\n setattr(searcher, "refit", True)\n \n@@ -687,7 +715,9 @@\n \n cv_results = pd.DataFrame(searcher.cv_results_)\n cv_results = cv_results[sorted(cv_results.columns)]\n- cv_results.to_csv(path_or_buf=outfile_result, sep="\\t", header=True, index=False)\n+ cv_results.to_csv(\n+ path_or_buf=outfile_result, sep="\\t", header=True, index=False\n+ )\n \n memory.clear(warn=False)\n \n' |
b |
diff -r e7f047a9dca9 -r 73e7f1c76ece simple_model_fit.py --- a/simple_model_fit.py Tue Apr 13 22:08:10 2021 +0000 +++ b/simple_model_fit.py Sat May 01 00:48:46 2021 +0000 |
[ |
@@ -7,7 +7,6 @@ from scipy.io import mmread from sklearn.pipeline import Pipeline - N_JOBS = int(__import__("os").environ.get("GALAXY_SLOTS", 1)) @@ -36,7 +35,7 @@ if name == "memory" or name.endswith("__memory") or name.endswith("_path"): new_p = {name: None} estimator.set_params(**new_p) - elif n_jobs is not None and (name == 'n_jobs' or name.endswith('__n_jobs')): + elif n_jobs is not None and (name == "n_jobs" or name.endswith("__n_jobs")): new_p = {name: n_jobs} estimator.set_params(**new_p) elif name.endswith("callbacks"): @@ -68,7 +67,9 @@ # tabular input if input_type == "tabular": header = "infer" if params["input_options"]["header1"] else None - column_option = params["input_options"]["column_selector_options_1"]["selected_column_selector_option"] + column_option = params["input_options"]["column_selector_options_1"][ + "selected_column_selector_option" + ] if column_option in [ "by_index_number", "all_but_by_index_number", @@ -90,7 +91,9 @@ # Get target y header = "infer" if params["input_options"]["header2"] else None - column_option = params["input_options"]["column_selector_options_2"]["selected_column_selector_option2"] + column_option = params["input_options"]["column_selector_options_2"][ + "selected_column_selector_option2" + ] if column_option in [ "by_index_number", "all_but_by_index_number", @@ -108,12 +111,9 @@ infile2 = pd.read_csv(infile2, sep="\t", header=header, parse_dates=True) loaded_df[df_key] = infile2 - y = read_columns(infile2, - c=c, - c_option=column_option, - sep='\t', - header=header, - parse_dates=True) + y = read_columns( + infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True + ) if len(y.shape) == 2 and y.shape[1] == 1: y = y.ravel() |
b |
diff -r e7f047a9dca9 -r 73e7f1c76ece stacking_ensembles.py --- a/stacking_ensembles.py Tue Apr 13 22:08:10 2021 +0000 +++ b/stacking_ensembles.py Sat May 01 00:48:46 2021 +0000 |
[ |
@@ -8,8 +8,8 @@ import mlxtend.classifier import mlxtend.regressor import pandas as pd -from galaxy_ml.utils import get_cv, get_estimator, get_search_params, load_model - +from galaxy_ml.utils import (get_cv, get_estimator, get_search_params, + load_model) warnings.filterwarnings("ignore") @@ -62,7 +62,9 @@ with open(meta_path, "rb") as f: meta_estimator = load_model(f) else: - estimator_json = params["algo_selection"]["meta_estimator"]["estimator_selector"] + estimator_json = params["algo_selection"]["meta_estimator"][ + "estimator_selector" + ] meta_estimator = get_estimator(estimator_json) options = params["algo_selection"]["options"] @@ -89,10 +91,14 @@ ensemble_estimator = klass(base_estimators, **options) elif mod == mlxtend.classifier: - ensemble_estimator = klass(classifiers=base_estimators, meta_classifier=meta_estimator, **options) + ensemble_estimator = klass( + classifiers=base_estimators, meta_classifier=meta_estimator, **options + ) else: - ensemble_estimator = klass(regressors=base_estimators, meta_regressor=meta_estimator, **options) + ensemble_estimator = klass( + regressors=base_estimators, meta_regressor=meta_estimator, **options + ) print(ensemble_estimator) for base_est in base_estimators: |
b |
diff -r e7f047a9dca9 -r 73e7f1c76ece test-data/le_input_w_header.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/le_input_w_header.tabular Sat May 01 00:48:46 2021 +0000 |
b |
@@ -0,0 +1,5 @@ +Class +Liverpool +Real Madrid +Bayern Munich +A.C. Milan |
b |
diff -r e7f047a9dca9 -r 73e7f1c76ece test-data/le_input_wo_header.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/le_input_wo_header.tabular Sat May 01 00:48:46 2021 +0000 |
b |
@@ -0,0 +1,4 @@ +Liverpool +Real Madrid +Bayern Munich +A.C. Milan |
b |
diff -r e7f047a9dca9 -r 73e7f1c76ece test-data/le_output.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/le_output.tabular Sat May 01 00:48:46 2021 +0000 |
b |
@@ -0,0 +1,4 @@ +2 +3 +1 +0 |
b |
diff -r e7f047a9dca9 -r 73e7f1c76ece test-data/mba_input_int_w.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/mba_input_int_w.tabular Sat May 01 00:48:46 2021 +0000 |
b |
@@ -0,0 +1,6 @@ +Transactions +10 11 12 13 14 15 +16 11 12 13 14 15 +10 17 13 14 +10 18 19 13 15 +19 11 11 13 20 14 |
b |
diff -r e7f047a9dca9 -r 73e7f1c76ece test-data/mba_input_int_wo.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/mba_input_int_wo.tabular Sat May 01 00:48:46 2021 +0000 |
b |
@@ -0,0 +1,5 @@ +10 11 12 13 14 15 +16 11 12 13 14 15 +10 17 13 14 +10 18 19 13 15 +19 11 11 13 20 14 |
b |
diff -r e7f047a9dca9 -r 73e7f1c76ece test-data/mba_input_str_w.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/mba_input_str_w.tabular Sat May 01 00:48:46 2021 +0000 |
b |
@@ -0,0 +1,6 @@ +Transactions +Milk Onion Nutmeg Kidney Beans Eggs Yogurt +Dill Onion Nutmeg Kidney Beans Eggs Yogurt +Milk Apple Kidney Beans Eggs +Milk Unicorn Corn Kidney Beans Yogurt +Corn Onion Onion Kidney Beans Ice cream Eggs |
b |
diff -r e7f047a9dca9 -r 73e7f1c76ece test-data/mba_input_str_wo.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/mba_input_str_wo.tabular Sat May 01 00:48:46 2021 +0000 |
b |
@@ -0,0 +1,5 @@ +Milk Onion Nutmeg Kidney Beans Eggs Yogurt +Dill Onion Nutmeg Kidney Beans Eggs Yogurt +Milk Apple Kidney Beans Eggs +Milk Unicorn Corn Kidney Beans Yogurt +Corn Onion Onion Kidney Beans Ice cream Eggs |
b |
diff -r e7f047a9dca9 -r 73e7f1c76ece test-data/mba_out_str.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/mba_out_str.tabular Sat May 01 00:48:46 2021 +0000 |
[ |
@@ -0,0 +1,7 @@ +antecedents consequents antecedent support consequent support support confidence lift leverage conviction +['Eggs'] ['Kidney Beans', 'Onion'] 0.8 0.6 0.6 0.7499999999999999 1.2499999999999998 0.12 1.5999999999999994 +['Eggs'] ['Onion'] 0.8 0.6 0.6 0.7499999999999999 1.2499999999999998 0.12 1.5999999999999994 +['Eggs', 'Kidney Beans'] ['Onion'] 0.8 0.6 0.6 0.7499999999999999 1.2499999999999998 0.12 1.5999999999999994 +['Kidney Beans', 'Onion'] ['Eggs'] 0.6 0.8 0.6 1.0 1.25 0.12 inf +['Onion'] ['Eggs'] 0.6 0.8 0.6 1.0 1.25 0.12 inf +['Onion'] ['Eggs', 'Kidney Beans'] 0.6 0.8 0.6 1.0 1.25 0.12 inf |
b |
diff -r e7f047a9dca9 -r 73e7f1c76ece test-data/mba_output_int.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/mba_output_int.tabular Sat May 01 00:48:46 2021 +0000 |
[ |
@@ -0,0 +1,7 @@ +antecedents consequents antecedent support consequent support support confidence lift leverage conviction +['11'] ['13', '14'] 0.6 0.8 0.6 1.0 1.25 0.12 inf +['11'] ['14'] 0.6 0.8 0.6 1.0 1.25 0.12 inf +['11', '13'] ['14'] 0.6 0.8 0.6 1.0 1.25 0.12 inf +['13', '14'] ['11'] 0.8 0.6 0.6 0.7499999999999999 1.2499999999999998 0.12 1.5999999999999994 +['14'] ['11'] 0.8 0.6 0.6 0.7499999999999999 1.2499999999999998 0.12 1.5999999999999994 +['14'] ['11', '13'] 0.8 0.6 0.6 0.7499999999999999 1.2499999999999998 0.12 1.5999999999999994 |
b |
diff -r e7f047a9dca9 -r 73e7f1c76ece test-data/mba_output_str.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/mba_output_str.tabular Sat May 01 00:48:46 2021 +0000 |
[ |
@@ -0,0 +1,7 @@ +antecedents consequents antecedent support consequent support support confidence lift leverage conviction +['Eggs'] ['Kidney Beans', 'Onion'] 0.8 0.6 0.6 0.7499999999999999 1.2499999999999998 0.12 1.5999999999999994 +['Eggs'] ['Onion'] 0.8 0.6 0.6 0.7499999999999999 1.2499999999999998 0.12 1.5999999999999994 +['Eggs', 'Kidney Beans'] ['Onion'] 0.8 0.6 0.6 0.7499999999999999 1.2499999999999998 0.12 1.5999999999999994 +['Kidney Beans', 'Onion'] ['Eggs'] 0.6 0.8 0.6 1.0 1.25 0.12 inf +['Onion'] ['Eggs'] 0.6 0.8 0.6 1.0 1.25 0.12 inf +['Onion'] ['Eggs', 'Kidney Beans'] 0.6 0.8 0.6 1.0 1.25 0.12 inf |
b |
diff -r e7f047a9dca9 -r 73e7f1c76ece to_categorical.py --- a/to_categorical.py Tue Apr 13 22:08:10 2021 +0000 +++ b/to_categorical.py Sat May 01 00:48:46 2021 +0000 |
b |
@@ -43,7 +43,9 @@ aparser = argparse.ArgumentParser() aparser.add_argument("-i", "--inputs", dest="inputs", required=True) aparser.add_argument("-y", "--infile", dest="infile") - aparser.add_argument("-n", "--num_classes", dest="num_classes", type=int, default=None) + aparser.add_argument( + "-n", "--num_classes", dest="num_classes", type=int, default=None + ) aparser.add_argument("-o", "--outfile", dest="outfile") args = aparser.parse_args() |
b |
diff -r e7f047a9dca9 -r 73e7f1c76ece train_test_eval.py --- a/train_test_eval.py Tue Apr 13 22:08:10 2021 +0000 +++ b/train_test_eval.py Sat May 01 00:48:46 2021 +0000 |
[ |
@@ -9,14 +9,8 @@ import numpy as np import pandas as pd from galaxy_ml.model_validations import train_test_split -from galaxy_ml.utils import ( - get_module, - get_scoring, - load_model, - read_columns, - SafeEval, - try_get_attr, -) +from galaxy_ml.utils import (get_module, get_scoring, load_model, + read_columns, SafeEval, try_get_attr) from scipy.io import mmread from sklearn import pipeline from sklearn.metrics.scorer import _check_multimetric_scoring @@ -24,7 +18,6 @@ from sklearn.model_selection._validation import _score from sklearn.utils import indexable, safe_indexing - _fit_and_score = try_get_attr("galaxy_ml.model_validations", "_fit_and_score") setattr(_search, "_fit_and_score", _fit_and_score) setattr(_validation, "_fit_and_score", _fit_and_score) @@ -262,12 +255,9 @@ infile2 = pd.read_csv(infile2, sep="\t", header=header, parse_dates=True) loaded_df[df_key] = infile2 - y = read_columns(infile2, - c=c, - c_option=column_option, - sep='\t', - header=header, - parse_dates=True) + y = read_columns( + infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True + ) if len(y.shape) == 2 and y.shape[1] == 1: y = y.ravel() if input_type == "refseq_and_interval": @@ -299,12 +289,14 @@ if df_key in loaded_df: groups = loaded_df[df_key] - groups = read_columns(groups, - c=c, - c_option=column_option, - sep='\t', - header=header, - parse_dates=True) + groups = read_columns( + groups, + c=c, + c_option=column_option, + sep="\t", + header=header, + parse_dates=True, + ) groups = groups.ravel() # del loaded_df @@ -371,9 +363,14 @@ "Stratified shuffle split is not " "applicable on empty target values!" ) - X_train, X_test, y_train, y_test, groups_train, _groups_test = train_test_split_none( - X, y, groups, **test_split_options - ) + ( + X_train, + X_test, + y_train, + y_test, + groups_train, + _groups_test, + ) = train_test_split_none(X, y, groups, **test_split_options) exp_scheme = params["experiment_schemes"]["selected_exp_scheme"] |
b |
diff -r e7f047a9dca9 -r 73e7f1c76ece train_test_split.py --- a/train_test_split.py Tue Apr 13 22:08:10 2021 +0000 +++ b/train_test_split.py Sat May 01 00:48:46 2021 +0000 |
[ |
@@ -28,17 +28,23 @@ # read groups if infile_groups: - header = "infer" if (params["mode_selection"]["cv_selector"]["groups_selector"]["header_g"]) else None - column_option = params["mode_selection"]["cv_selector"]["groups_selector"]["column_selector_options_g"][ - "selected_column_selector_option_g" - ] + header = ( + "infer" + if (params["mode_selection"]["cv_selector"]["groups_selector"]["header_g"]) + else None + ) + column_option = params["mode_selection"]["cv_selector"]["groups_selector"][ + "column_selector_options_g" + ]["selected_column_selector_option_g"] if column_option in [ "by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name", ]: - c = params["mode_selection"]["cv_selector"]["groups_selector"]["column_selector_options_g"]["col_g"] + c = params["mode_selection"]["cv_selector"]["groups_selector"][ + "column_selector_options_g" + ]["col_g"] else: c = None @@ -67,7 +73,10 @@ total_n_splits = splitter.get_n_splits(array.values, y=y, groups=groups) if nth_split > total_n_splits: - raise ValueError("Total number of splits is {}, but got `nth_split` " "= {}".format(total_n_splits, nth_split)) + raise ValueError( + "Total number of splits is {}, but got `nth_split` " + "= {}".format(total_n_splits, nth_split) + ) i = 1 for train_index, test_index in splitter.split(array.values, y=y, groups=groups): @@ -137,7 +146,9 @@ # cv splitter else: - train, test = _get_single_cv_split(params, array, infile_labels=infile_labels, infile_groups=infile_groups) + train, test = _get_single_cv_split( + params, array, infile_labels=infile_labels, infile_groups=infile_groups + ) print("Input shape: %s" % repr(array.shape)) print("Train shape: %s" % repr(train.shape)) |