# HG changeset patch # User bgruening # Date 1618336335 0 # Node ID 1e99cfb71f4057db6ec3e80edc0173d22be40d26 # Parent 7068b5fcd623db8958749d228b6eb9261e86982e "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04" diff -r 7068b5fcd623 -r 1e99cfb71f40 fitted_model_eval.py --- a/fitted_model_eval.py Thu Oct 01 20:27:36 2020 +0000 +++ b/fitted_model_eval.py Tue Apr 13 17:52:15 2021 +0000 @@ -11,7 +11,7 @@ def _get_X_y(params, infile1, infile2): - """ read from inputs and output X and y + """read from inputs and output X and y Parameters ---------- @@ -26,35 +26,40 @@ # store read dataframe object loaded_df = {} - input_type = params['input_options']['selected_input'] + input_type = params["input_options"]["selected_input"] # tabular input - if input_type == 'tabular': - header = 'infer' if params['input_options']['header1'] else None - column_option = (params['input_options']['column_selector_options_1'] - ['selected_column_selector_option']) - if column_option in ['by_index_number', 'all_but_by_index_number', - 'by_header_name', 'all_but_by_header_name']: - c = params['input_options']['column_selector_options_1']['col1'] + if input_type == "tabular": + header = "infer" if params["input_options"]["header1"] else None + column_option = params["input_options"]["column_selector_options_1"]["selected_column_selector_option"] + if column_option in [ + "by_index_number", + "all_but_by_index_number", + "by_header_name", + "all_but_by_header_name", + ]: + c = params["input_options"]["column_selector_options_1"]["col1"] else: c = None df_key = infile1 + repr(header) - df = pd.read_csv(infile1, sep='\t', header=header, - parse_dates=True) + df = pd.read_csv(infile1, sep="\t", header=header, parse_dates=True) loaded_df[df_key] = df X = read_columns(df, c=c, c_option=column_option).astype(float) # sparse input - elif input_type == 'sparse': - X = mmread(open(infile1, 'r')) + elif input_type == "sparse": + X = mmread(open(infile1, "r")) # Get target y - header = 'infer' if params['input_options']['header2'] else None - column_option = (params['input_options']['column_selector_options_2'] - ['selected_column_selector_option2']) - if column_option in ['by_index_number', 'all_but_by_index_number', - 'by_header_name', 'all_but_by_header_name']: - c = params['input_options']['column_selector_options_2']['col2'] + header = "infer" if params["input_options"]["header2"] else None + column_option = params["input_options"]["column_selector_options_2"]["selected_column_selector_option2"] + if column_option in [ + "by_index_number", + "all_but_by_index_number", + "by_header_name", + "all_but_by_header_name", + ]: + c = params["input_options"]["column_selector_options_2"]["col2"] else: c = None @@ -62,26 +67,24 @@ if df_key in loaded_df: infile2 = loaded_df[df_key] else: - infile2 = pd.read_csv(infile2, sep='\t', - header=header, parse_dates=True) + infile2 = pd.read_csv(infile2, sep="\t", header=header, parse_dates=True) loaded_df[df_key] = infile2 - y = read_columns( - infile2, - c=c, - c_option=column_option, - sep='\t', - header=header, - parse_dates=True) + y = read_columns(infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True) if len(y.shape) == 2 and y.shape[1] == 1: y = y.ravel() return X, y -def main(inputs, infile_estimator, outfile_eval, - infile_weights=None, infile1=None, - infile2=None): +def main( + inputs, + infile_estimator, + outfile_eval, + infile_weights=None, + infile1=None, + infile2=None, +): """ Parameter --------- @@ -103,49 +106,55 @@ infile2 : str File path to dataset containing target values """ - warnings.filterwarnings('ignore') + warnings.filterwarnings("ignore") - with open(inputs, 'r') as param_handler: + with open(inputs, "r") as param_handler: params = json.load(param_handler) X_test, y_test = _get_X_y(params, infile1, infile2) # load model - with open(infile_estimator, 'rb') as est_handler: + with open(infile_estimator, "rb") as est_handler: estimator = load_model(est_handler) main_est = estimator if isinstance(estimator, Pipeline): main_est = estimator.steps[-1][-1] - if hasattr(main_est, 'config') and hasattr(main_est, 'load_weights'): - if not infile_weights or infile_weights == 'None': - raise ValueError("The selected model skeleton asks for weights, " - "but no dataset for weights was provided!") + if hasattr(main_est, "config") and hasattr(main_est, "load_weights"): + if not infile_weights or infile_weights == "None": + raise ValueError( + "The selected model skeleton asks for weights, " "but no dataset for weights was provided!" + ) main_est.load_weights(infile_weights) # handle scorer, convert to scorer dict - scoring = params['scoring'] + # Check if scoring is specified + scoring = params["scoring"] + if scoring is not None: + # get_scoring() expects secondary_scoring to be a comma separated string (not a list) + # Check if secondary_scoring is specified + secondary_scoring = scoring.get("secondary_scoring", None) + if secondary_scoring is not None: + # If secondary_scoring is specified, convert the list into comman separated string + scoring["secondary_scoring"] = ",".join(scoring["secondary_scoring"]) + scorer = get_scoring(scoring) scorer, _ = _check_multimetric_scoring(estimator, scoring=scorer) - if hasattr(estimator, 'evaluate'): - scores = estimator.evaluate(X_test, y_test=y_test, - scorer=scorer, - is_multimetric=True) + if hasattr(estimator, "evaluate"): + scores = estimator.evaluate(X_test, y_test=y_test, scorer=scorer, is_multimetric=True) else: - scores = _score(estimator, X_test, y_test, scorer, - is_multimetric=True) + scores = _score(estimator, X_test, y_test, scorer, is_multimetric=True) # handle output for name, score in scores.items(): scores[name] = [score] df = pd.DataFrame(scores) df = df[sorted(df.columns)] - df.to_csv(path_or_buf=outfile_eval, sep='\t', - header=True, index=False) + df.to_csv(path_or_buf=outfile_eval, sep="\t", header=True, index=False) -if __name__ == '__main__': +if __name__ == "__main__": aparser = argparse.ArgumentParser() aparser.add_argument("-i", "--inputs", dest="inputs", required=True) aparser.add_argument("-e", "--infile_estimator", dest="infile_estimator") @@ -155,6 +164,11 @@ aparser.add_argument("-O", "--outfile_eval", dest="outfile_eval") args = aparser.parse_args() - main(args.inputs, args.infile_estimator, args.outfile_eval, - infile_weights=args.infile_weights, infile1=args.infile1, - infile2=args.infile2) + main( + args.inputs, + args.infile_estimator, + args.outfile_eval, + infile_weights=args.infile_weights, + infile1=args.infile1, + infile2=args.infile2, + ) diff -r 7068b5fcd623 -r 1e99cfb71f40 keras_deep_learning.py --- a/keras_deep_learning.py Thu Oct 01 20:27:36 2020 +0000 +++ b/keras_deep_learning.py Tue Apr 13 17:52:15 2021 +0000 @@ -177,11 +177,11 @@ # merge layers if 'merging_layers' in options: idxs = literal_eval(options.pop('merging_layers')) - merging_layers = [all_layers[i-1] for i in idxs] + merging_layers = [all_layers[i - 1] for i in idxs] new_layer = klass(**options)(merging_layers) # non-input layers elif inbound_nodes is not None: - new_layer = klass(**options)(all_layers[inbound_nodes-1]) + new_layer = klass(**options)(all_layers[inbound_nodes - 1]) # input layers else: new_layer = klass(**options) @@ -189,10 +189,10 @@ all_layers.append(new_layer) input_indexes = _handle_shape(config['input_layers']) - input_layers = [all_layers[i-1] for i in input_indexes] + input_layers = [all_layers[i - 1] for i in input_indexes] output_indexes = _handle_shape(config['output_layers']) - output_layers = [all_layers[i-1] for i in output_indexes] + output_layers = [all_layers[i - 1] for i in output_indexes] return Model(inputs=input_layers, outputs=output_layers) @@ -300,8 +300,7 @@ options.update((inputs['mode_selection']['compile_params'] ['optimizer_selection']['optimizer_options'])) - train_metrics = (inputs['mode_selection']['compile_params'] - ['metrics']).split(',') + train_metrics = inputs['mode_selection']['compile_params']['metrics'] if train_metrics[-1] == 'none': train_metrics = train_metrics[:-1] options['metrics'] = train_metrics diff -r 7068b5fcd623 -r 1e99cfb71f40 keras_train_and_eval.py --- a/keras_train_and_eval.py Thu Oct 01 20:27:36 2020 +0000 +++ b/keras_train_and_eval.py Tue Apr 13 17:52:15 2021 +0000 @@ -10,7 +10,6 @@ from scipy.io import mmread from sklearn.pipeline import Pipeline from sklearn.metrics.scorer import _check_multimetric_scoring -from sklearn import model_selection from sklearn.model_selection._validation import _score from sklearn.model_selection import _search, _validation from sklearn.utils import indexable, safe_indexing @@ -18,39 +17,49 @@ from galaxy_ml.externals.selene_sdk.utils import compute_score from galaxy_ml.model_validations import train_test_split from galaxy_ml.keras_galaxy_models import _predict_generator -from galaxy_ml.utils import (SafeEval, get_scoring, load_model, - read_columns, try_get_attr, get_module, - clean_params, get_main_estimator) +from galaxy_ml.utils import ( + SafeEval, + get_scoring, + load_model, + read_columns, + try_get_attr, + get_module, + clean_params, + get_main_estimator, +) -_fit_and_score = try_get_attr('galaxy_ml.model_validations', '_fit_and_score') -setattr(_search, '_fit_and_score', _fit_and_score) -setattr(_validation, '_fit_and_score', _fit_and_score) +_fit_and_score = try_get_attr("galaxy_ml.model_validations", "_fit_and_score") +setattr(_search, "_fit_and_score", _fit_and_score) +setattr(_validation, "_fit_and_score", _fit_and_score) -N_JOBS = int(os.environ.get('GALAXY_SLOTS', 1)) -CACHE_DIR = os.path.join(os.getcwd(), 'cached') +N_JOBS = int(os.environ.get("GALAXY_SLOTS", 1)) +CACHE_DIR = os.path.join(os.getcwd(), "cached") del os -NON_SEARCHABLE = ('n_jobs', 'pre_dispatch', 'memory', '_path', - 'nthread', 'callbacks') -ALLOWED_CALLBACKS = ('EarlyStopping', 'TerminateOnNaN', 'ReduceLROnPlateau', - 'CSVLogger', 'None') +NON_SEARCHABLE = ("n_jobs", "pre_dispatch", "memory", "_path", "nthread", "callbacks") +ALLOWED_CALLBACKS = ( + "EarlyStopping", + "TerminateOnNaN", + "ReduceLROnPlateau", + "CSVLogger", + "None", +) def _eval_swap_params(params_builder): swap_params = {} - for p in params_builder['param_set']: - swap_value = p['sp_value'].strip() - if swap_value == '': + for p in params_builder["param_set"]: + swap_value = p["sp_value"].strip() + if swap_value == "": continue - param_name = p['sp_name'] + param_name = p["sp_name"] if param_name.lower().endswith(NON_SEARCHABLE): - warnings.warn("Warning: `%s` is not eligible for search and was " - "omitted!" % param_name) + warnings.warn("Warning: `%s` is not eligible for search and was " "omitted!" % param_name) continue - if not swap_value.startswith(':'): + if not swap_value.startswith(":"): safe_eval = SafeEval(load_scipy=True, load_numpy=True) ev = safe_eval(swap_value) else: @@ -77,34 +86,31 @@ else: new_arrays.append(arr) - if kwargs['shuffle'] == 'None': - kwargs['shuffle'] = None + if kwargs["shuffle"] == "None": + kwargs["shuffle"] = None - group_names = kwargs.pop('group_names', None) + group_names = kwargs.pop("group_names", None) if group_names is not None and group_names.strip(): - group_names = [name.strip() for name in - group_names.split(',')] + group_names = [name.strip() for name in group_names.split(",")] new_arrays = indexable(*new_arrays) - groups = kwargs['labels'] + groups = kwargs["labels"] n_samples = new_arrays[0].shape[0] index_arr = np.arange(n_samples) test = index_arr[np.isin(groups, group_names)] train = index_arr[~np.isin(groups, group_names)] - rval = list(chain.from_iterable( - (safe_indexing(a, train), - safe_indexing(a, test)) for a in new_arrays)) + rval = list(chain.from_iterable((safe_indexing(a, train), safe_indexing(a, test)) for a in new_arrays)) else: rval = train_test_split(*new_arrays, **kwargs) for pos in nones: - rval[pos * 2: 2] = [None, None] + rval[pos * 2 : 2] = [None, None] return rval def _evaluate(y_true, pred_probas, scorer, is_multimetric=True): - """ output scores based on input scorer + """output scores based on input scorer Parameters ---------- @@ -118,52 +124,55 @@ """ if y_true.ndim == 1 or y_true.shape[-1] == 1: pred_probas = pred_probas.ravel() - pred_labels = (pred_probas > 0.5).astype('int32') - targets = y_true.ravel().astype('int32') + pred_labels = (pred_probas > 0.5).astype("int32") + targets = y_true.ravel().astype("int32") if not is_multimetric: - preds = pred_labels if scorer.__class__.__name__ == \ - '_PredictScorer' else pred_probas + preds = pred_labels if scorer.__class__.__name__ == "_PredictScorer" else pred_probas score = scorer._score_func(targets, preds, **scorer._kwargs) return score else: scores = {} for name, one_scorer in scorer.items(): - preds = pred_labels if one_scorer.__class__.__name__\ - == '_PredictScorer' else pred_probas - score = one_scorer._score_func(targets, preds, - **one_scorer._kwargs) + preds = pred_labels if one_scorer.__class__.__name__ == "_PredictScorer" else pred_probas + score = one_scorer._score_func(targets, preds, **one_scorer._kwargs) scores[name] = score # TODO: multi-class metrics # multi-label else: - pred_labels = (pred_probas > 0.5).astype('int32') - targets = y_true.astype('int32') + pred_labels = (pred_probas > 0.5).astype("int32") + targets = y_true.astype("int32") if not is_multimetric: - preds = pred_labels if scorer.__class__.__name__ == \ - '_PredictScorer' else pred_probas - score, _ = compute_score(preds, targets, - scorer._score_func) + preds = pred_labels if scorer.__class__.__name__ == "_PredictScorer" else pred_probas + score, _ = compute_score(preds, targets, scorer._score_func) return score else: scores = {} for name, one_scorer in scorer.items(): - preds = pred_labels if one_scorer.__class__.__name__\ - == '_PredictScorer' else pred_probas - score, _ = compute_score(preds, targets, - one_scorer._score_func) + preds = pred_labels if one_scorer.__class__.__name__ == "_PredictScorer" else pred_probas + score, _ = compute_score(preds, targets, one_scorer._score_func) scores[name] = score return scores -def main(inputs, infile_estimator, infile1, infile2, - outfile_result, outfile_object=None, - outfile_weights=None, outfile_y_true=None, - outfile_y_preds=None, groups=None, - ref_seq=None, intervals=None, targets=None, - fasta_path=None): +def main( + inputs, + infile_estimator, + infile1, + infile2, + outfile_result, + outfile_object=None, + outfile_weights=None, + outfile_y_true=None, + outfile_y_preds=None, + groups=None, + ref_seq=None, + intervals=None, + targets=None, + fasta_path=None, +): """ Parameter --------- @@ -209,19 +218,19 @@ fasta_path : str File path to dataset containing fasta file """ - warnings.simplefilter('ignore') + warnings.simplefilter("ignore") - with open(inputs, 'r') as param_handler: + with open(inputs, "r") as param_handler: params = json.load(param_handler) # load estimator - with open(infile_estimator, 'rb') as estimator_handler: + with open(infile_estimator, "rb") as estimator_handler: estimator = load_model(estimator_handler) estimator = clean_params(estimator) # swap hyperparameter - swapping = params['experiment_schemes']['hyperparams_swapping'] + swapping = params["experiment_schemes"]["hyperparams_swapping"] swap_params = _eval_swap_params(swapping) estimator.set_params(**swap_params) @@ -230,38 +239,39 @@ # store read dataframe object loaded_df = {} - input_type = params['input_options']['selected_input'] + input_type = params["input_options"]["selected_input"] # tabular input - if input_type == 'tabular': - header = 'infer' if params['input_options']['header1'] else None - column_option = (params['input_options']['column_selector_options_1'] - ['selected_column_selector_option']) - if column_option in ['by_index_number', 'all_but_by_index_number', - 'by_header_name', 'all_but_by_header_name']: - c = params['input_options']['column_selector_options_1']['col1'] + if input_type == "tabular": + header = "infer" if params["input_options"]["header1"] else None + column_option = params["input_options"]["column_selector_options_1"]["selected_column_selector_option"] + if column_option in [ + "by_index_number", + "all_but_by_index_number", + "by_header_name", + "all_but_by_header_name", + ]: + c = params["input_options"]["column_selector_options_1"]["col1"] else: c = None df_key = infile1 + repr(header) - df = pd.read_csv(infile1, sep='\t', header=header, - parse_dates=True) + df = pd.read_csv(infile1, sep="\t", header=header, parse_dates=True) loaded_df[df_key] = df X = read_columns(df, c=c, c_option=column_option).astype(float) # sparse input - elif input_type == 'sparse': - X = mmread(open(infile1, 'r')) + elif input_type == "sparse": + X = mmread(open(infile1, "r")) # fasta_file input - elif input_type == 'seq_fasta': - pyfaidx = get_module('pyfaidx') + elif input_type == "seq_fasta": + pyfaidx = get_module("pyfaidx") sequences = pyfaidx.Fasta(fasta_path) n_seqs = len(sequences.keys()) X = np.arange(n_seqs)[:, np.newaxis] for param in estimator_params.keys(): - if param.endswith('fasta_path'): - estimator.set_params( - **{param: fasta_path}) + if param.endswith("fasta_path"): + estimator.set_params(**{param: fasta_path}) break else: raise ValueError( @@ -270,25 +280,29 @@ "KerasGBatchClassifier with " "FastaDNABatchGenerator/FastaProteinBatchGenerator " "or having GenomeOneHotEncoder/ProteinOneHotEncoder " - "in pipeline!") + "in pipeline!" + ) - elif input_type == 'refseq_and_interval': + elif input_type == "refseq_and_interval": path_params = { - 'data_batch_generator__ref_genome_path': ref_seq, - 'data_batch_generator__intervals_path': intervals, - 'data_batch_generator__target_path': targets + "data_batch_generator__ref_genome_path": ref_seq, + "data_batch_generator__intervals_path": intervals, + "data_batch_generator__target_path": targets, } estimator.set_params(**path_params) n_intervals = sum(1 for line in open(intervals)) X = np.arange(n_intervals)[:, np.newaxis] # Get target y - header = 'infer' if params['input_options']['header2'] else None - column_option = (params['input_options']['column_selector_options_2'] - ['selected_column_selector_option2']) - if column_option in ['by_index_number', 'all_but_by_index_number', - 'by_header_name', 'all_but_by_header_name']: - c = params['input_options']['column_selector_options_2']['col2'] + header = "infer" if params["input_options"]["header2"] else None + column_option = params["input_options"]["column_selector_options_2"]["selected_column_selector_option2"] + if column_option in [ + "by_index_number", + "all_but_by_index_number", + "by_header_name", + "all_but_by_header_name", + ]: + c = params["input_options"]["column_selector_options_2"]["col2"] else: c = None @@ -296,37 +310,35 @@ if df_key in loaded_df: infile2 = loaded_df[df_key] else: - infile2 = pd.read_csv(infile2, sep='\t', - header=header, parse_dates=True) + infile2 = pd.read_csv(infile2, sep="\t", header=header, parse_dates=True) loaded_df[df_key] = infile2 - y = read_columns( - infile2, - c=c, - c_option=column_option, - sep='\t', - header=header, - parse_dates=True) + y = read_columns(infile2, + c=c, + c_option=column_option, + sep='\t', + header=header, + parse_dates=True) if len(y.shape) == 2 and y.shape[1] == 1: y = y.ravel() - if input_type == 'refseq_and_interval': - estimator.set_params( - data_batch_generator__features=y.ravel().tolist()) + if input_type == "refseq_and_interval": + estimator.set_params(data_batch_generator__features=y.ravel().tolist()) y = None # end y # load groups if groups: - groups_selector = (params['experiment_schemes']['test_split'] - ['split_algos']).pop('groups_selector') + groups_selector = (params["experiment_schemes"]["test_split"]["split_algos"]).pop("groups_selector") - header = 'infer' if groups_selector['header_g'] else None - column_option = \ - (groups_selector['column_selector_options_g'] - ['selected_column_selector_option_g']) - if column_option in ['by_index_number', 'all_but_by_index_number', - 'by_header_name', 'all_but_by_header_name']: - c = groups_selector['column_selector_options_g']['col_g'] + header = "infer" if groups_selector["header_g"] else None + column_option = groups_selector["column_selector_options_g"]["selected_column_selector_option_g"] + if column_option in [ + "by_index_number", + "all_but_by_index_number", + "by_header_name", + "all_but_by_header_name", + ]: + c = groups_selector["column_selector_options_g"]["col_g"] else: c = None @@ -334,13 +346,12 @@ if df_key in loaded_df: groups = loaded_df[df_key] - groups = read_columns( - groups, - c=c, - c_option=column_option, - sep='\t', - header=header, - parse_dates=True) + groups = read_columns(groups, + c=c, + c_option=column_option, + sep='\t', + header=header, + parse_dates=True) groups = groups.ravel() # del loaded_df @@ -349,86 +360,99 @@ # cache iraps_core fits could increase search speed significantly memory = joblib.Memory(location=CACHE_DIR, verbose=0) main_est = get_main_estimator(estimator) - if main_est.__class__.__name__ == 'IRAPSClassifier': + if main_est.__class__.__name__ == "IRAPSClassifier": main_est.set_params(memory=memory) # handle scorer, convert to scorer dict scoring = params['experiment_schemes']['metrics']['scoring'] + if scoring is not None: + # get_scoring() expects secondary_scoring to be a comma separated string (not a list) + # Check if secondary_scoring is specified + secondary_scoring = scoring.get("secondary_scoring", None) + if secondary_scoring is not None: + # If secondary_scoring is specified, convert the list into comman separated string + scoring["secondary_scoring"] = ",".join(scoring["secondary_scoring"]) + scorer = get_scoring(scoring) scorer, _ = _check_multimetric_scoring(estimator, scoring=scorer) # handle test (first) split - test_split_options = (params['experiment_schemes'] - ['test_split']['split_algos']) + test_split_options = params["experiment_schemes"]["test_split"]["split_algos"] - if test_split_options['shuffle'] == 'group': - test_split_options['labels'] = groups - if test_split_options['shuffle'] == 'stratified': + if test_split_options["shuffle"] == "group": + test_split_options["labels"] = groups + if test_split_options["shuffle"] == "stratified": if y is not None: - test_split_options['labels'] = y + test_split_options["labels"] = y else: - raise ValueError("Stratified shuffle split is not " - "applicable on empty target values!") + raise ValueError("Stratified shuffle split is not " "applicable on empty target values!") - X_train, X_test, y_train, y_test, groups_train, groups_test = \ - train_test_split_none(X, y, groups, **test_split_options) + ( + X_train, + X_test, + y_train, + y_test, + groups_train, + _groups_test, + ) = train_test_split_none(X, y, groups, **test_split_options) - exp_scheme = params['experiment_schemes']['selected_exp_scheme'] + exp_scheme = params["experiment_schemes"]["selected_exp_scheme"] # handle validation (second) split - if exp_scheme == 'train_val_test': - val_split_options = (params['experiment_schemes'] - ['val_split']['split_algos']) + if exp_scheme == "train_val_test": + val_split_options = params["experiment_schemes"]["val_split"]["split_algos"] - if val_split_options['shuffle'] == 'group': - val_split_options['labels'] = groups_train - if val_split_options['shuffle'] == 'stratified': + if val_split_options["shuffle"] == "group": + val_split_options["labels"] = groups_train + if val_split_options["shuffle"] == "stratified": if y_train is not None: - val_split_options['labels'] = y_train + val_split_options["labels"] = y_train else: - raise ValueError("Stratified shuffle split is not " - "applicable on empty target values!") + raise ValueError("Stratified shuffle split is not " "applicable on empty target values!") - X_train, X_val, y_train, y_val, groups_train, groups_val = \ - train_test_split_none(X_train, y_train, groups_train, - **val_split_options) + ( + X_train, + X_val, + y_train, + y_val, + groups_train, + _groups_val, + ) = train_test_split_none(X_train, y_train, groups_train, **val_split_options) # train and eval - if hasattr(estimator, 'validation_data'): - if exp_scheme == 'train_val_test': - estimator.fit(X_train, y_train, - validation_data=(X_val, y_val)) + if hasattr(estimator, "validation_data"): + if exp_scheme == "train_val_test": + estimator.fit(X_train, y_train, validation_data=(X_val, y_val)) else: - estimator.fit(X_train, y_train, - validation_data=(X_test, y_test)) + estimator.fit(X_train, y_train, validation_data=(X_test, y_test)) else: estimator.fit(X_train, y_train) - if hasattr(estimator, 'evaluate'): + if hasattr(estimator, "evaluate"): steps = estimator.prediction_steps batch_size = estimator.batch_size - generator = estimator.data_generator_.flow(X_test, y=y_test, - batch_size=batch_size) - predictions, y_true = _predict_generator(estimator.model_, generator, - steps=steps) + generator = estimator.data_generator_.flow(X_test, y=y_test, batch_size=batch_size) + predictions, y_true = _predict_generator(estimator.model_, generator, steps=steps) scores = _evaluate(y_true, predictions, scorer, is_multimetric=True) else: - if hasattr(estimator, 'predict_proba'): + if hasattr(estimator, "predict_proba"): predictions = estimator.predict_proba(X_test) else: predictions = estimator.predict(X_test) y_true = y_test - scores = _score(estimator, X_test, y_test, scorer, - is_multimetric=True) + scores = _score(estimator, X_test, y_test, scorer, is_multimetric=True) if outfile_y_true: try: - pd.DataFrame(y_true).to_csv(outfile_y_true, sep='\t', - index=False) + pd.DataFrame(y_true).to_csv(outfile_y_true, sep="\t", index=False) pd.DataFrame(predictions).astype(np.float32).to_csv( - outfile_y_preds, sep='\t', index=False, - float_format='%g', chunksize=10000) + outfile_y_preds, + sep="\t", + index=False, + float_format="%g", + chunksize=10000, + ) except Exception as e: print("Error in saving predictions: %s" % e) @@ -437,8 +461,7 @@ scores[name] = [score] df = pd.DataFrame(scores) df = df[sorted(df.columns)] - df.to_csv(path_or_buf=outfile_result, sep='\t', - header=True, index=False) + df.to_csv(path_or_buf=outfile_result, sep="\t", header=True, index=False) memory.clear(warn=False) @@ -447,23 +470,22 @@ if isinstance(estimator, Pipeline): main_est = estimator.steps[-1][-1] - if hasattr(main_est, 'model_') \ - and hasattr(main_est, 'save_weights'): + if hasattr(main_est, "model_") and hasattr(main_est, "save_weights"): if outfile_weights: main_est.save_weights(outfile_weights) del main_est.model_ del main_est.fit_params del main_est.model_class_ - del main_est.validation_data - if getattr(main_est, 'data_generator_', None): + if getattr(main_est, "validation_data", None): + del main_est.validation_data + if getattr(main_est, "data_generator_", None): del main_est.data_generator_ - with open(outfile_object, 'wb') as output_handler: - pickle.dump(estimator, output_handler, - pickle.HIGHEST_PROTOCOL) + with open(outfile_object, "wb") as output_handler: + pickle.dump(estimator, output_handler, pickle.HIGHEST_PROTOCOL) -if __name__ == '__main__': +if __name__ == "__main__": aparser = argparse.ArgumentParser() aparser.add_argument("-i", "--inputs", dest="inputs", required=True) aparser.add_argument("-e", "--estimator", dest="infile_estimator") @@ -481,11 +503,19 @@ aparser.add_argument("-f", "--fasta_path", dest="fasta_path") args = aparser.parse_args() - main(args.inputs, args.infile_estimator, args.infile1, args.infile2, - args.outfile_result, outfile_object=args.outfile_object, - outfile_weights=args.outfile_weights, - outfile_y_true=args.outfile_y_true, - outfile_y_preds=args.outfile_y_preds, - groups=args.groups, - ref_seq=args.ref_seq, intervals=args.intervals, - targets=args.targets, fasta_path=args.fasta_path) + main( + args.inputs, + args.infile_estimator, + args.infile1, + args.infile2, + args.outfile_result, + outfile_object=args.outfile_object, + outfile_weights=args.outfile_weights, + outfile_y_true=args.outfile_y_true, + outfile_y_preds=args.outfile_y_preds, + groups=args.groups, + ref_seq=args.ref_seq, + intervals=args.intervals, + targets=args.targets, + fasta_path=args.fasta_path, + ) diff -r 7068b5fcd623 -r 1e99cfb71f40 main_macros.xml --- a/main_macros.xml Thu Oct 01 20:27:36 2020 +0000 +++ b/main_macros.xml Tue Apr 13 17:52:15 2021 +0000 @@ -1,1952 +1,1940 @@ - 1.0.8.2 + 1.0.8.3 - - - python - Galaxy-ML - - - + + + Galaxy-ML + + + - - - - - + + + + + - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + - -
- -
-
+ +
+ +
+
- - - - - - - - - - + + + + + + + + + + - - - - - - - - - + + + + + + + + + - - - + + + - - - + + + - - - - - - - - + + + + + + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + - - - - + + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - - - - - + + + + + + + - - - - - - - + + + + + + + - - - + + + - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + - - - + + + - - - + + + - - - + + + - - - - - - - + + + + + + + - - - - - - - - + + + + + + + + - - - + + + - - - + + + - - - + + + - - - - + + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - - - - + + + + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - - - - - - - - + + + + + + + + + + - - - + + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + +
+
+ + +
+ + + + + + + + + + + + + + + + + + + +
+
+ + +
+ + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ + + + + + + + + + + +
+
+ + +
+ + + + + + + + + +
+
- - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + +
+ + + + +
+
+ + +
+ + + + + + + + + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - + + +
+ + + +
+
+ +
+ +
+
+
- - - - - + + +
+ + + + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - + + +
+ + +
+
+ +
+ + + +
+
+ +
+ +
+
+ +
+ + + + + + +
+
+ +
- - - - - - - - - + + + +
+
+
+ +
+ + +
+
+ +
+ + + +
+
+ +
+ + + + +
- - + +
+ + + + + + + + +
+
+ +
+ + + + + +
+
+ +
+ + + + + + + + + + + +
-
-
- - - - - - - - + - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - -
- - - - - - - - - - - - - - - -
-
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - -
- - - - - - - - - - - - - - - - - - - -
-
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - -
- - - - - - - - - - - - - - - - - - - - - - - - -
-
+ +
+ + + + + +
+
- -
- - - - - - - - - - - -
-
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + - -
- - - - - - - - - -
-
+ + + - - - - - - - - - -
- - - - -
-
+ + + + + + + + + + + + + - -
- - - - - - - - - - -
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - + + + +
+ + + + + + + + +
+
+ + +
+ +
+
+ + +
+ +
+
+ + +
+ +
+
+ + +
+ +
+
+ + +
+ +
+
+ +
+ +
+
+
- - - - - - - - - - - + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + - - - - - - - - - - + +
+ + + +
+
- - - - - - - - - - - - - - - - - - - + + + +
+ + + +
+
+
- - -
- - - -
-
- -
- -
-
-
+ + + +
+ + + + + +
+
+
- - -
- - - - - -
-
-
+ + + +
+ + + + + + +
+
+
- - - - - - - - - - - - - - - - - - - - - + + + +
+ + + + + + + + + + + + +
+
+
- - -
- - -
-
- -
- - - -
-
- -
- -
-
- -
- - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - -
-
- -
+ - - - -
-
-
- -
- - -
-
- -
- - - -
-
- -
- - - - -
-
- -
- - - - - - - - -
-
- -
- - - - - -
-
- -
- - - - - - - - - - - -
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - -
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - -
-
- - -
- -
-
- - -
- -
-
- - -
- -
-
- - -
- -
-
- - -
- -
-
- -
- -
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - -
-
- - - - -
- - - -
-
-
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - -
- - - - - -
-
-
- - - - -
- - - - - - -
-
-
+ + + + + + + + + + + + + + + + + + + + + + + + - - - -
- - - - - - - + + + + + + + + + - - - - -
-
-
+ - - - - - - - - - - - - - - - - - + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + +
+ + + + + + + + + + + + + + + + + + +
+
+ + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - -
-
- - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - -
- - - -
-
+ +
+ + + +
+
- - - - - + + + + + + + + + + + - - - - - - - - + + - -
- - - - - - - - - -
-
+ +
+ + + + + + + + + +
+
- - - - - - - - - - - - - - + + + + + + + + + + + + + + - - - - + + + + - + - - - - selected_tasks['selected_task'] == 'load' - - - selected_tasks['selected_task'] == 'train' - - - + + + + selected_tasks['selected_task'] == 'load' + + + selected_tasks['selected_task'] == 'train' + + + - - - - 10.5281/zenodo.15094 - - + + + + 10.5281/zenodo.15094 + + - - - - @article{scikit-learn, - title={Scikit-learn: Machine Learning in {P}ython}, - author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. + + + + @article{scikit-learn, title={Scikit-learn: Machine Learning in {P}ython}, author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and - Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.}, - journal={Journal of Machine Learning Research}, - volume={12}, - pages={2825--2830}, - year={2011} + Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.}, journal={Journal of Machine Learning Research}, volume={12}, pages={2825--2830}, year={2011} } - - - - + + + + - - - + + + @Misc{, author = {Eric Jones and Travis Oliphant and Pearu Peterson and others}, title = {{SciPy}: Open source scientific tools for {Python}}, @@ -1954,12 +1942,12 @@ url = "http://www.scipy.org/", note = {[Online; accessed 2016-04-09]} } - - - + + + - - + + @article{DBLP:journals/corr/abs-1711-08477, author = {Ryan J. Urbanowicz and Randal S. Olson and @@ -1977,11 +1965,11 @@ biburl = {https://dblp.org/rec/bib/journals/corr/abs-1711-08477}, bibsource = {dblp computer science bibliography, https://dblp.org} } - - + + - - + + @inproceedings{Chen:2016:XST:2939672.2939785, author = {Chen, Tianqi and Guestrin, Carlos}, title = {{XGBoost}: A Scalable Tree Boosting System}, @@ -1999,11 +1987,11 @@ address = {New York, NY, USA}, keywords = {large-scale machine learning}, } - - + + - - + + @article{JMLR:v18:16-365, author = {Guillaume Lema{{\^i}}tre and Fernando Nogueira and Christos K. Aridas}, title = {Imbalanced-learn: A Python Toolbox to Tackle the Curse of Imbalanced Datasets in Machine Learning}, @@ -2014,22 +2002,14 @@ pages = {1-5}, url = {http://jmlr.org/papers/v18/16-365.html} } - - + + - - - @article{chen2019selene, - title={Selene: a PyTorch-based deep learning library for sequence data}, - author={Chen, Kathleen M and Cofer, Evan M and Zhou, Jian and Troyanskaya, Olga G}, - journal={Nature methods}, - volume={16}, - number={4}, - pages={315}, - year={2019}, - publisher={Nature Publishing Group} + + + @article{chen2019selene, title={Selene: a PyTorch-based deep learning library for sequence data}, author={Chen, Kathleen M and Cofer, Evan M and Zhou, Jian and Troyanskaya, Olga G}, journal={Nature methods}, volume={16}, number={4}, pages={315}, year={2019}, publisher={Nature Publishing Group} } - - + +
diff -r 7068b5fcd623 -r 1e99cfb71f40 ml_visualization_ex.py --- a/ml_visualization_ex.py Thu Oct 01 20:27:36 2020 +0000 +++ b/ml_visualization_ex.py Tue Apr 13 17:52:15 2021 +0000 @@ -22,16 +22,16 @@ # plotly default colors default_colors = [ - '#1f77b4', # muted blue - '#ff7f0e', # safety orange - '#2ca02c', # cooked asparagus green - '#d62728', # brick red - '#9467bd', # muted purple - '#8c564b', # chestnut brown - '#e377c2', # raspberry yogurt pink - '#7f7f7f', # middle gray - '#bcbd22', # curry yellow-green - '#17becf' # blue-teal + "#1f77b4", # muted blue + "#ff7f0e", # safety orange + "#2ca02c", # cooked asparagus green + "#d62728", # brick red + "#9467bd", # muted purple + "#8c564b", # chestnut brown + "#e377c2", # raspberry yogurt pink + "#7f7f7f", # middle gray + "#bcbd22", # curry yellow-green + "#17becf", # blue-teal ] @@ -52,46 +52,31 @@ y_true = df1.iloc[:, idx].values y_score = df2.iloc[:, idx].values - precision, recall, _ = precision_recall_curve( - y_true, y_score, pos_label=pos_label) - ap = average_precision_score( - y_true, y_score, pos_label=pos_label or 1) + precision, recall, _ = precision_recall_curve(y_true, y_score, pos_label=pos_label) + ap = average_precision_score(y_true, y_score, pos_label=pos_label or 1) trace = go.Scatter( x=recall, y=precision, - mode='lines', - marker=dict( - color=default_colors[idx % len(default_colors)] - ), - name='%s (area = %.3f)' % (idx, ap) + mode="lines", + marker=dict(color=default_colors[idx % len(default_colors)]), + name="%s (area = %.3f)" % (idx, ap), ) data.append(trace) layout = go.Layout( - xaxis=dict( - title='Recall', - linecolor='lightslategray', - linewidth=1 - ), - yaxis=dict( - title='Precision', - linecolor='lightslategray', - linewidth=1 - ), + xaxis=dict(title="Recall", linecolor="lightslategray", linewidth=1), + yaxis=dict(title="Precision", linecolor="lightslategray", linewidth=1), title=dict( - text=title or 'Precision-Recall Curve', + text=title or "Precision-Recall Curve", x=0.5, y=0.92, - xanchor='center', - yanchor='top' + xanchor="center", + yanchor="top", ), - font=dict( - family="sans-serif", - size=11 - ), + font=dict(family="sans-serif", size=11), # control backgroud colors - plot_bgcolor='rgba(255,255,255,0)' + plot_bgcolor="rgba(255,255,255,0)", ) """ legend=dict( @@ -112,45 +97,47 @@ plotly.offline.plot(fig, filename="output.html", auto_open=False) # to be discovered by `from_work_dir` - os.rename('output.html', 'output') + os.rename("output.html", "output") def visualize_pr_curve_matplotlib(df1, df2, pos_label, title=None): - """visualize pr-curve using matplotlib and output svg image - """ + """visualize pr-curve using matplotlib and output svg image""" backend = matplotlib.get_backend() if "inline" not in backend: matplotlib.use("SVG") - plt.style.use('seaborn-colorblind') + plt.style.use("seaborn-colorblind") plt.figure() for idx in range(df1.shape[1]): y_true = df1.iloc[:, idx].values y_score = df2.iloc[:, idx].values - precision, recall, _ = precision_recall_curve( - y_true, y_score, pos_label=pos_label) - ap = average_precision_score( - y_true, y_score, pos_label=pos_label or 1) + precision, recall, _ = precision_recall_curve(y_true, y_score, pos_label=pos_label) + ap = average_precision_score(y_true, y_score, pos_label=pos_label or 1) - plt.step(recall, precision, 'r-', color="black", alpha=0.3, - lw=1, where="post", label='%s (area = %.3f)' % (idx, ap)) + plt.step( + recall, + precision, + "r-", + color="black", + alpha=0.3, + lw=1, + where="post", + label="%s (area = %.3f)" % (idx, ap), + ) plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) - plt.xlabel('Recall') - plt.ylabel('Precision') - title = title or 'Precision-Recall Curve' + plt.xlabel("Recall") + plt.ylabel("Precision") + title = title or "Precision-Recall Curve" plt.title(title) folder = os.getcwd() plt.savefig(os.path.join(folder, "output.svg"), format="svg") - os.rename(os.path.join(folder, "output.svg"), - os.path.join(folder, "output")) + os.rename(os.path.join(folder, "output.svg"), os.path.join(folder, "output")) -def visualize_roc_curve_plotly(df1, df2, pos_label, - drop_intermediate=True, - title=None): +def visualize_roc_curve_plotly(df1, df2, pos_label, drop_intermediate=True, title=None): """output roc-curve in html using plotly df1 : pandas.DataFrame @@ -169,45 +156,31 @@ y_true = df1.iloc[:, idx].values y_score = df2.iloc[:, idx].values - fpr, tpr, _ = roc_curve(y_true, y_score, pos_label=pos_label, - drop_intermediate=drop_intermediate) + fpr, tpr, _ = roc_curve(y_true, y_score, pos_label=pos_label, drop_intermediate=drop_intermediate) roc_auc = auc(fpr, tpr) trace = go.Scatter( x=fpr, y=tpr, - mode='lines', - marker=dict( - color=default_colors[idx % len(default_colors)] - ), - name='%s (area = %.3f)' % (idx, roc_auc) + mode="lines", + marker=dict(color=default_colors[idx % len(default_colors)]), + name="%s (area = %.3f)" % (idx, roc_auc), ) data.append(trace) layout = go.Layout( - xaxis=dict( - title='False Positive Rate', - linecolor='lightslategray', - linewidth=1 - ), - yaxis=dict( - title='True Positive Rate', - linecolor='lightslategray', - linewidth=1 - ), + xaxis=dict(title="False Positive Rate", linecolor="lightslategray", linewidth=1), + yaxis=dict(title="True Positive Rate", linecolor="lightslategray", linewidth=1), title=dict( - text=title or 'Receiver Operating Characteristic (ROC) Curve', + text=title or "Receiver Operating Characteristic (ROC) Curve", x=0.5, y=0.92, - xanchor='center', - yanchor='top' + xanchor="center", + yanchor="top", ), - font=dict( - family="sans-serif", - size=11 - ), + font=dict(family="sans-serif", size=11), # control backgroud colors - plot_bgcolor='rgba(255,255,255,0)' + plot_bgcolor="rgba(255,255,255,0)", ) """ # legend=dict( @@ -229,66 +202,84 @@ plotly.offline.plot(fig, filename="output.html", auto_open=False) # to be discovered by `from_work_dir` - os.rename('output.html', 'output') + os.rename("output.html", "output") -def visualize_roc_curve_matplotlib(df1, df2, pos_label, - drop_intermediate=True, - title=None): - """visualize roc-curve using matplotlib and output svg image - """ +def visualize_roc_curve_matplotlib(df1, df2, pos_label, drop_intermediate=True, title=None): + """visualize roc-curve using matplotlib and output svg image""" backend = matplotlib.get_backend() if "inline" not in backend: matplotlib.use("SVG") - plt.style.use('seaborn-colorblind') + plt.style.use("seaborn-colorblind") plt.figure() for idx in range(df1.shape[1]): y_true = df1.iloc[:, idx].values y_score = df2.iloc[:, idx].values - fpr, tpr, _ = roc_curve(y_true, y_score, pos_label=pos_label, - drop_intermediate=drop_intermediate) + fpr, tpr, _ = roc_curve(y_true, y_score, pos_label=pos_label, drop_intermediate=drop_intermediate) roc_auc = auc(fpr, tpr) - plt.step(fpr, tpr, 'r-', color="black", alpha=0.3, lw=1, - where="post", label='%s (area = %.3f)' % (idx, roc_auc)) + plt.step( + fpr, + tpr, + "r-", + color="black", + alpha=0.3, + lw=1, + where="post", + label="%s (area = %.3f)" % (idx, roc_auc), + ) plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) - plt.xlabel('False Positive Rate') - plt.ylabel('True Positive Rate') - title = title or 'Receiver Operating Characteristic (ROC) Curve' + plt.xlabel("False Positive Rate") + plt.ylabel("True Positive Rate") + title = title or "Receiver Operating Characteristic (ROC) Curve" plt.title(title) folder = os.getcwd() plt.savefig(os.path.join(folder, "output.svg"), format="svg") - os.rename(os.path.join(folder, "output.svg"), - os.path.join(folder, "output")) + os.rename(os.path.join(folder, "output.svg"), os.path.join(folder, "output")) def get_dataframe(file_path, plot_selection, header_name, column_name): - header = 'infer' if plot_selection[header_name] else None + header = "infer" if plot_selection[header_name] else None column_option = plot_selection[column_name]["selected_column_selector_option"] - if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]: + if column_option in [ + "by_index_number", + "all_but_by_index_number", + "by_header_name", + "all_but_by_header_name", + ]: col = plot_selection[column_name]["col1"] else: col = None _, input_df = read_columns(file_path, c=col, - c_option=column_option, - return_df=True, - sep='\t', header=header, - parse_dates=True) + c_option=column_option, + return_df=True, + sep='\t', header=header, + parse_dates=True) return input_df -def main(inputs, infile_estimator=None, infile1=None, - infile2=None, outfile_result=None, - outfile_object=None, groups=None, - ref_seq=None, intervals=None, - targets=None, fasta_path=None, - model_config=None, true_labels=None, - predicted_labels=None, plot_color=None, - title=None): +def main( + inputs, + infile_estimator=None, + infile1=None, + infile2=None, + outfile_result=None, + outfile_object=None, + groups=None, + ref_seq=None, + intervals=None, + targets=None, + fasta_path=None, + model_config=None, + true_labels=None, + predicted_labels=None, + plot_color=None, + title=None, +): """ Parameter --------- @@ -341,34 +332,39 @@ title : str, default is None Title of the confusion matrix heatmap """ - warnings.simplefilter('ignore') + warnings.simplefilter("ignore") - with open(inputs, 'r') as param_handler: + with open(inputs, "r") as param_handler: params = json.load(param_handler) - title = params['plotting_selection']['title'].strip() - plot_type = params['plotting_selection']['plot_type'] - plot_format = params['plotting_selection']['plot_format'] + title = params["plotting_selection"]["title"].strip() + plot_type = params["plotting_selection"]["plot_type"] + plot_format = params["plotting_selection"]["plot_format"] - if plot_type == 'feature_importances': - with open(infile_estimator, 'rb') as estimator_handler: + if plot_type == "feature_importances": + with open(infile_estimator, "rb") as estimator_handler: estimator = load_model(estimator_handler) - column_option = (params['plotting_selection'] - ['column_selector_options'] - ['selected_column_selector_option']) - if column_option in ['by_index_number', 'all_but_by_index_number', - 'by_header_name', 'all_but_by_header_name']: - c = (params['plotting_selection'] - ['column_selector_options']['col1']) + column_option = params["plotting_selection"]["column_selector_options"]["selected_column_selector_option"] + if column_option in [ + "by_index_number", + "all_but_by_index_number", + "by_header_name", + "all_but_by_header_name", + ]: + c = params["plotting_selection"]["column_selector_options"]["col1"] else: c = None - _, input_df = read_columns(infile1, c=c, - c_option=column_option, - return_df=True, - sep='\t', header='infer', - parse_dates=True) + _, input_df = read_columns( + infile1, + c=c, + c_option=column_option, + return_df=True, + sep="\t", + header="infer", + parse_dates=True, + ) feature_names = input_df.columns.values @@ -379,16 +375,14 @@ feature_names = feature_names[mask] estimator = estimator.steps[-1][-1] - if hasattr(estimator, 'coef_'): + if hasattr(estimator, "coef_"): coefs = estimator.coef_ else: - coefs = getattr(estimator, 'feature_importances_', None) + coefs = getattr(estimator, "feature_importances_", None) if coefs is None: - raise RuntimeError('The classifier does not expose ' - '"coef_" or "feature_importances_" ' - 'attributes') + raise RuntimeError("The classifier does not expose " '"coef_" or "feature_importances_" ' "attributes") - threshold = params['plotting_selection']['threshold'] + threshold = params["plotting_selection"]["threshold"] if threshold is not None: mask = (coefs > threshold) | (coefs < -threshold) coefs = coefs[mask] @@ -397,80 +391,74 @@ # sort indices = np.argsort(coefs)[::-1] - trace = go.Bar(x=feature_names[indices], - y=coefs[indices]) + trace = go.Bar(x=feature_names[indices], y=coefs[indices]) layout = go.Layout(title=title or "Feature Importances") fig = go.Figure(data=[trace], layout=layout) - plotly.offline.plot(fig, filename="output.html", - auto_open=False) + plotly.offline.plot(fig, filename="output.html", auto_open=False) # to be discovered by `from_work_dir` - os.rename('output.html', 'output') + os.rename("output.html", "output") return 0 - elif plot_type in ('pr_curve', 'roc_curve'): - df1 = pd.read_csv(infile1, sep='\t', header='infer') - df2 = pd.read_csv(infile2, sep='\t', header='infer').astype(np.float32) + elif plot_type in ("pr_curve", "roc_curve"): + df1 = pd.read_csv(infile1, sep="\t", header="infer") + df2 = pd.read_csv(infile2, sep="\t", header="infer").astype(np.float32) - minimum = params['plotting_selection']['report_minimum_n_positives'] + minimum = params["plotting_selection"]["report_minimum_n_positives"] # filter out columns whose n_positives is beblow the threhold if minimum: mask = df1.sum(axis=0) >= minimum df1 = df1.loc[:, mask] df2 = df2.loc[:, mask] - pos_label = params['plotting_selection']['pos_label'].strip() \ - or None + pos_label = params["plotting_selection"]["pos_label"].strip() or None - if plot_type == 'pr_curve': - if plot_format == 'plotly_html': + if plot_type == "pr_curve": + if plot_format == "plotly_html": visualize_pr_curve_plotly(df1, df2, pos_label, title=title) else: visualize_pr_curve_matplotlib(df1, df2, pos_label, title) - else: # 'roc_curve' - drop_intermediate = (params['plotting_selection'] - ['drop_intermediate']) - if plot_format == 'plotly_html': - visualize_roc_curve_plotly(df1, df2, pos_label, - drop_intermediate=drop_intermediate, - title=title) + else: # 'roc_curve' + drop_intermediate = params["plotting_selection"]["drop_intermediate"] + if plot_format == "plotly_html": + visualize_roc_curve_plotly( + df1, + df2, + pos_label, + drop_intermediate=drop_intermediate, + title=title, + ) else: visualize_roc_curve_matplotlib( - df1, df2, pos_label, + df1, + df2, + pos_label, drop_intermediate=drop_intermediate, - title=title) + title=title, + ) return 0 - elif plot_type == 'rfecv_gridscores': - input_df = pd.read_csv(infile1, sep='\t', header='infer') + elif plot_type == "rfecv_gridscores": + input_df = pd.read_csv(infile1, sep="\t", header="infer") scores = input_df.iloc[:, 0] - steps = params['plotting_selection']['steps'].strip() + steps = params["plotting_selection"]["steps"].strip() steps = safe_eval(steps) data = go.Scatter( x=list(range(len(scores))), y=scores, text=[str(_) for _ in steps] if steps else None, - mode='lines' + mode="lines", ) layout = go.Layout( xaxis=dict(title="Number of features selected"), yaxis=dict(title="Cross validation score"), - title=dict( - text=title or None, - x=0.5, - y=0.92, - xanchor='center', - yanchor='top' - ), - font=dict( - family="sans-serif", - size=11 - ), + title=dict(text=title or None, x=0.5, y=0.92, xanchor="center", yanchor="top"), + font=dict(family="sans-serif", size=11), # control backgroud colors - plot_bgcolor='rgba(255,255,255,0)' + plot_bgcolor="rgba(255,255,255,0)", ) """ # legend=dict( @@ -489,55 +477,43 @@ """ fig = go.Figure(data=[data], layout=layout) - plotly.offline.plot(fig, filename="output.html", - auto_open=False) + plotly.offline.plot(fig, filename="output.html", auto_open=False) # to be discovered by `from_work_dir` - os.rename('output.html', 'output') + os.rename("output.html", "output") return 0 - elif plot_type == 'learning_curve': - input_df = pd.read_csv(infile1, sep='\t', header='infer') - plot_std_err = params['plotting_selection']['plot_std_err'] + elif plot_type == "learning_curve": + input_df = pd.read_csv(infile1, sep="\t", header="infer") + plot_std_err = params["plotting_selection"]["plot_std_err"] data1 = go.Scatter( - x=input_df['train_sizes_abs'], - y=input_df['mean_train_scores'], - error_y=dict( - array=input_df['std_train_scores'] - ) if plot_std_err else None, - mode='lines', + x=input_df["train_sizes_abs"], + y=input_df["mean_train_scores"], + error_y=dict(array=input_df["std_train_scores"]) if plot_std_err else None, + mode="lines", name="Train Scores", ) data2 = go.Scatter( - x=input_df['train_sizes_abs'], - y=input_df['mean_test_scores'], - error_y=dict( - array=input_df['std_test_scores'] - ) if plot_std_err else None, - mode='lines', + x=input_df["train_sizes_abs"], + y=input_df["mean_test_scores"], + error_y=dict(array=input_df["std_test_scores"]) if plot_std_err else None, + mode="lines", name="Test Scores", ) layout = dict( - xaxis=dict( - title='No. of samples' - ), - yaxis=dict( - title='Performance Score' - ), + xaxis=dict(title="No. of samples"), + yaxis=dict(title="Performance Score"), # modify these configurations to customize image title=dict( - text=title or 'Learning Curve', + text=title or "Learning Curve", x=0.5, y=0.92, - xanchor='center', - yanchor='top' + xanchor="center", + yanchor="top", ), - font=dict( - family="sans-serif", - size=11 - ), + font=dict(family="sans-serif", size=11), # control backgroud colors - plot_bgcolor='rgba(255,255,255,0)' + plot_bgcolor="rgba(255,255,255,0)", ) """ # legend=dict( @@ -556,27 +532,26 @@ """ fig = go.Figure(data=[data1, data2], layout=layout) - plotly.offline.plot(fig, filename="output.html", - auto_open=False) + plotly.offline.plot(fig, filename="output.html", auto_open=False) # to be discovered by `from_work_dir` - os.rename('output.html', 'output') + os.rename("output.html", "output") return 0 - elif plot_type == 'keras_plot_model': - with open(model_config, 'r') as f: + elif plot_type == "keras_plot_model": + with open(model_config, "r") as f: model_str = f.read() model = model_from_json(model_str) plot_model(model, to_file="output.png") - os.rename('output.png', 'output') + os.rename("output.png", "output") return 0 - elif plot_type == 'classification_confusion_matrix': + elif plot_type == "classification_confusion_matrix": plot_selection = params["plotting_selection"] input_true = get_dataframe(true_labels, plot_selection, "header_true", "column_selector_options_true") - header_predicted = 'infer' if plot_selection["header_predicted"] else None - input_predicted = pd.read_csv(predicted_labels, sep='\t', parse_dates=True, header=header_predicted) + header_predicted = "infer" if plot_selection["header_predicted"] else None + input_predicted = pd.read_csv(predicted_labels, sep="\t", parse_dates=True, header=header_predicted) true_classes = input_true.iloc[:, -1].copy() predicted_classes = input_predicted.iloc[:, -1].copy() axis_labels = list(set(true_classes)) @@ -586,15 +561,15 @@ for i in range(len(c_matrix)): for j in range(len(c_matrix)): ax.text(j, i, c_matrix[i, j], ha="center", va="center", color="k") - ax.set_ylabel('True class labels') - ax.set_xlabel('Predicted class labels') + ax.set_ylabel("True class labels") + ax.set_xlabel("Predicted class labels") ax.set_title(title) ax.set_xticks(axis_labels) ax.set_yticks(axis_labels) fig.colorbar(im, ax=ax) fig.tight_layout() plt.savefig("output.png", dpi=125) - os.rename('output.png', 'output') + os.rename("output.png", "output") return 0 @@ -603,7 +578,7 @@ # fig.write_image("image.pdf", format='pdf', width=340*2, height=226*2) -if __name__ == '__main__': +if __name__ == "__main__": aparser = argparse.ArgumentParser() aparser.add_argument("-i", "--inputs", dest="inputs", required=True) aparser.add_argument("-e", "--estimator", dest="infile_estimator") @@ -623,11 +598,21 @@ aparser.add_argument("-pt", "--title", dest="title") args = aparser.parse_args() - main(args.inputs, args.infile_estimator, args.infile1, args.infile2, - args.outfile_result, outfile_object=args.outfile_object, - groups=args.groups, ref_seq=args.ref_seq, intervals=args.intervals, - targets=args.targets, fasta_path=args.fasta_path, - model_config=args.model_config, true_labels=args.true_labels, - predicted_labels=args.predicted_labels, - plot_color=args.plot_color, - title=args.title) + main( + args.inputs, + args.infile_estimator, + args.infile1, + args.infile2, + args.outfile_result, + outfile_object=args.outfile_object, + groups=args.groups, + ref_seq=args.ref_seq, + intervals=args.intervals, + targets=args.targets, + fasta_path=args.fasta_path, + model_config=args.model_config, + true_labels=args.true_labels, + predicted_labels=args.predicted_labels, + plot_color=args.plot_color, + title=args.title, + ) diff -r 7068b5fcd623 -r 1e99cfb71f40 model_prediction.py --- a/model_prediction.py Thu Oct 01 20:27:36 2020 +0000 +++ b/model_prediction.py Tue Apr 13 17:52:15 2021 +0000 @@ -1,23 +1,29 @@ import argparse import json +import warnings + import numpy as np import pandas as pd -import warnings - from scipy.io import mmread from sklearn.pipeline import Pipeline -from galaxy_ml.utils import (load_model, read_columns, - get_module, try_get_attr) +from galaxy_ml.utils import (get_module, load_model, + read_columns, try_get_attr) + + +N_JOBS = int(__import__("os").environ.get("GALAXY_SLOTS", 1)) -N_JOBS = int(__import__('os').environ.get('GALAXY_SLOTS', 1)) - - -def main(inputs, infile_estimator, outfile_predict, - infile_weights=None, infile1=None, - fasta_path=None, ref_seq=None, - vcf_path=None): +def main( + inputs, + infile_estimator, + outfile_predict, + infile_weights=None, + infile1=None, + fasta_path=None, + ref_seq=None, + vcf_path=None, +): """ Parameter --------- @@ -45,96 +51,94 @@ vcf_path : str File path to dataset containing variants info. """ - warnings.filterwarnings('ignore') + warnings.filterwarnings("ignore") - with open(inputs, 'r') as param_handler: + with open(inputs, "r") as param_handler: params = json.load(param_handler) # load model - with open(infile_estimator, 'rb') as est_handler: + with open(infile_estimator, "rb") as est_handler: estimator = load_model(est_handler) main_est = estimator if isinstance(estimator, Pipeline): main_est = estimator.steps[-1][-1] - if hasattr(main_est, 'config') and hasattr(main_est, 'load_weights'): - if not infile_weights or infile_weights == 'None': - raise ValueError("The selected model skeleton asks for weights, " - "but dataset for weights wan not selected!") + if hasattr(main_est, "config") and hasattr(main_est, "load_weights"): + if not infile_weights or infile_weights == "None": + raise ValueError( + "The selected model skeleton asks for weights, " "but dataset for weights wan not selected!" + ) main_est.load_weights(infile_weights) # handle data input - input_type = params['input_options']['selected_input'] + input_type = params["input_options"]["selected_input"] # tabular input - if input_type == 'tabular': - header = 'infer' if params['input_options']['header1'] else None - column_option = (params['input_options'] - ['column_selector_options_1'] - ['selected_column_selector_option']) - if column_option in ['by_index_number', 'all_but_by_index_number', - 'by_header_name', 'all_but_by_header_name']: - c = params['input_options']['column_selector_options_1']['col1'] + if input_type == "tabular": + header = "infer" if params["input_options"]["header1"] else None + column_option = params["input_options"]["column_selector_options_1"]["selected_column_selector_option"] + if column_option in [ + "by_index_number", + "all_but_by_index_number", + "by_header_name", + "all_but_by_header_name", + ]: + c = params["input_options"]["column_selector_options_1"]["col1"] else: c = None - df = pd.read_csv(infile1, sep='\t', header=header, parse_dates=True) + df = pd.read_csv(infile1, sep="\t", header=header, parse_dates=True) X = read_columns(df, c=c, c_option=column_option).astype(float) - if params['method'] == 'predict': + if params["method"] == "predict": preds = estimator.predict(X) else: preds = estimator.predict_proba(X) # sparse input - elif input_type == 'sparse': - X = mmread(open(infile1, 'r')) - if params['method'] == 'predict': + elif input_type == "sparse": + X = mmread(open(infile1, "r")) + if params["method"] == "predict": preds = estimator.predict(X) else: preds = estimator.predict_proba(X) # fasta input - elif input_type == 'seq_fasta': - if not hasattr(estimator, 'data_batch_generator'): + elif input_type == "seq_fasta": + if not hasattr(estimator, "data_batch_generator"): raise ValueError( "To do prediction on sequences in fasta input, " "the estimator must be a `KerasGBatchClassifier`" - "equipped with data_batch_generator!") - pyfaidx = get_module('pyfaidx') + "equipped with data_batch_generator!" + ) + pyfaidx = get_module("pyfaidx") sequences = pyfaidx.Fasta(fasta_path) n_seqs = len(sequences.keys()) X = np.arange(n_seqs)[:, np.newaxis] seq_length = estimator.data_batch_generator.seq_length - batch_size = getattr(estimator, 'batch_size', 32) + batch_size = getattr(estimator, "batch_size", 32) steps = (n_seqs + batch_size - 1) // batch_size - seq_type = params['input_options']['seq_type'] - klass = try_get_attr( - 'galaxy_ml.preprocessors', seq_type) + seq_type = params["input_options"]["seq_type"] + klass = try_get_attr("galaxy_ml.preprocessors", seq_type) - pred_data_generator = klass( - fasta_path, seq_length=seq_length) + pred_data_generator = klass(fasta_path, seq_length=seq_length) - if params['method'] == 'predict': - preds = estimator.predict( - X, data_generator=pred_data_generator, steps=steps) + if params["method"] == "predict": + preds = estimator.predict(X, data_generator=pred_data_generator, steps=steps) else: - preds = estimator.predict_proba( - X, data_generator=pred_data_generator, steps=steps) + preds = estimator.predict_proba(X, data_generator=pred_data_generator, steps=steps) # vcf input - elif input_type == 'variant_effect': - klass = try_get_attr('galaxy_ml.preprocessors', - 'GenomicVariantBatchGenerator') + elif input_type == "variant_effect": + klass = try_get_attr("galaxy_ml.preprocessors", "GenomicVariantBatchGenerator") - options = params['input_options'] - options.pop('selected_input') - if options['blacklist_regions'] == 'none': - options['blacklist_regions'] = None + options = params["input_options"] + options.pop("selected_input") + if options["blacklist_regions"] == "none": + options["blacklist_regions"] = None - pred_data_generator = klass( - ref_genome_path=ref_seq, vcf_path=vcf_path, **options) + pred_data_generator = klass(ref_genome_path=ref_seq, vcf_path=vcf_path, **options) pred_data_generator.set_processing_attrs() @@ -143,9 +147,8 @@ # predict 1600 sample at once then write to file gen_flow = pred_data_generator.flow(batch_size=1600) - file_writer = open(outfile_predict, 'w') - header_row = '\t'.join(['chrom', 'pos', 'name', 'ref', - 'alt', 'strand']) + file_writer = open(outfile_predict, "w") + header_row = "\t".join(["chrom", "pos", "name", "ref", "alt", "strand"]) file_writer.write(header_row) header_done = False @@ -155,23 +158,24 @@ try: while steps_done < len(gen_flow): index_array = next(gen_flow.index_generator) - batch_X = gen_flow._get_batches_of_transformed_samples( - index_array) + batch_X = gen_flow._get_batches_of_transformed_samples(index_array) - if params['method'] == 'predict': + if params["method"] == "predict": batch_preds = estimator.predict( batch_X, # The presence of `pred_data_generator` below is to # override model carrying data_generator if there # is any. - data_generator=pred_data_generator) + data_generator=pred_data_generator, + ) else: batch_preds = estimator.predict_proba( batch_X, # The presence of `pred_data_generator` below is to # override model carrying data_generator if there # is any. - data_generator=pred_data_generator) + data_generator=pred_data_generator, + ) if batch_preds.ndim == 1: batch_preds = batch_preds[:, np.newaxis] @@ -181,12 +185,12 @@ if not header_done: heads = np.arange(batch_preds.shape[-1]).astype(str) - heads_str = '\t'.join(heads) + heads_str = "\t".join(heads) file_writer.write("\t%s\n" % heads_str) header_done = True for row in batch_out: - row_str = '\t'.join(row) + row_str = "\t".join(row) file_writer.write("%s\n" % row_str) steps_done += 1 @@ -200,14 +204,14 @@ # output if len(preds.shape) == 1: - rval = pd.DataFrame(preds, columns=['Predicted']) + rval = pd.DataFrame(preds, columns=["Predicted"]) else: rval = pd.DataFrame(preds) - rval.to_csv(outfile_predict, sep='\t', header=True, index=False) + rval.to_csv(outfile_predict, sep="\t", header=True, index=False) -if __name__ == '__main__': +if __name__ == "__main__": aparser = argparse.ArgumentParser() aparser.add_argument("-i", "--inputs", dest="inputs", required=True) aparser.add_argument("-e", "--infile_estimator", dest="infile_estimator") @@ -219,7 +223,13 @@ aparser.add_argument("-v", "--vcf_path", dest="vcf_path") args = aparser.parse_args() - main(args.inputs, args.infile_estimator, args.outfile_predict, - infile_weights=args.infile_weights, infile1=args.infile1, - fasta_path=args.fasta_path, ref_seq=args.ref_seq, - vcf_path=args.vcf_path) + main( + args.inputs, + args.infile_estimator, + args.outfile_predict, + infile_weights=args.infile_weights, + infile1=args.infile1, + fasta_path=args.fasta_path, + ref_seq=args.ref_seq, + vcf_path=args.vcf_path, + ) diff -r 7068b5fcd623 -r 1e99cfb71f40 pca.py --- a/pca.py Thu Oct 01 20:27:36 2020 +0000 +++ b/pca.py Tue Apr 13 17:52:15 2021 +0000 @@ -1,98 +1,185 @@ import argparse + import numpy as np -from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA from galaxy_ml.utils import read_columns +from sklearn.decomposition import IncrementalPCA, KernelPCA, PCA + def main(): - parser = argparse.ArgumentParser(description='RDKit screen') - parser.add_argument('-i', '--infile', - help="Input file") - parser.add_argument('--header', action='store_true', help="Include the header row or skip it") - parser.add_argument('-c', '--columns', type=str.lower, default='all', choices=['by_index_number', 'all_but_by_index_number',\ - 'by_header_name', 'all_but_by_header_name', 'all_columns'], - help="Choose to select all columns, or exclude/include some") - parser.add_argument('-ci', '--column_indices', type=str.lower, - help="Choose to select all columns, or exclude/include some") - parser.add_argument('-n', '--number', nargs='?', type=int, default=None,\ - help="Number of components to keep. If not set, all components are kept") - parser.add_argument('--whiten', action='store_true', help="Whiten the components") - parser.add_argument('-t', '--pca_type', type=str.lower, default='classical', choices=['classical', 'incremental', 'kernel'], - help="Choose which flavour of PCA to use") - parser.add_argument('-s', '--svd_solver', type=str.lower, default='auto', choices=['auto', 'full', 'arpack', 'randomized'], - help="Choose the type of svd solver.") - parser.add_argument('-b', '--batch_size', nargs='?', type=int, default=None,\ - help="The number of samples to use for each batch") - parser.add_argument('-k', '--kernel', type=str.lower, default='linear',\ - choices=['linear', 'poly', 'rbf', 'sigmoid', 'cosine', 'precomputed'], - help="Choose the type of kernel.") - parser.add_argument('-g', '--gamma', nargs='?', type=float, default=None, - help='Kernel coefficient for rbf, poly and sigmoid kernels. Ignored by other kernels') - parser.add_argument('-tol', '--tolerance', type=float, default=0.0, - help='Convergence tolerance for arpack. If 0, optimal value will be chosen by arpack') - parser.add_argument('-mi', '--max_iter', nargs='?', type=int, default=None,\ - help="Maximum number of iterations for arpack") - parser.add_argument('-d', '--degree', type=int, default=3,\ - help="Degree for poly kernels. Ignored by other kernels") - parser.add_argument('-cf', '--coef0', type=float, default=1.0, - help='Independent term in poly and sigmoid kernels') - parser.add_argument('-e', '--eigen_solver', type=str.lower, default='auto', choices=['auto', 'dense', 'arpack'], - help="Choose the type of eigen solver.") - parser.add_argument('-o', '--outfile', - help="Base name for output file (no extension).") + parser = argparse.ArgumentParser(description="RDKit screen") + parser.add_argument("-i", "--infile", help="Input file") + parser.add_argument( + "--header", action="store_true", help="Include the header row or skip it" + ) + parser.add_argument( + "-c", + "--columns", + type=str.lower, + default="all", + choices=[ + "by_index_number", + "all_but_by_index_number", + "by_header_name", + "all_but_by_header_name", + "all_columns", + ], + help="Choose to select all columns, or exclude/include some", + ) + parser.add_argument( + "-ci", + "--column_indices", + type=str.lower, + help="Choose to select all columns, or exclude/include some", + ) + parser.add_argument( + "-n", + "--number", + nargs="?", + type=int, + default=None, + help="Number of components to keep. If not set, all components are kept", + ) + parser.add_argument("--whiten", action="store_true", help="Whiten the components") + parser.add_argument( + "-t", + "--pca_type", + type=str.lower, + default="classical", + choices=["classical", "incremental", "kernel"], + help="Choose which flavour of PCA to use", + ) + parser.add_argument( + "-s", + "--svd_solver", + type=str.lower, + default="auto", + choices=["auto", "full", "arpack", "randomized"], + help="Choose the type of svd solver.", + ) + parser.add_argument( + "-b", + "--batch_size", + nargs="?", + type=int, + default=None, + help="The number of samples to use for each batch", + ) + parser.add_argument( + "-k", + "--kernel", + type=str.lower, + default="linear", + choices=["linear", "poly", "rbf", "sigmoid", "cosine", "precomputed"], + help="Choose the type of kernel.", + ) + parser.add_argument( + "-g", + "--gamma", + nargs="?", + type=float, + default=None, + help="Kernel coefficient for rbf, poly and sigmoid kernels. Ignored by other kernels", + ) + parser.add_argument( + "-tol", + "--tolerance", + type=float, + default=0.0, + help="Convergence tolerance for arpack. If 0, optimal value will be chosen by arpack", + ) + parser.add_argument( + "-mi", + "--max_iter", + nargs="?", + type=int, + default=None, + help="Maximum number of iterations for arpack", + ) + parser.add_argument( + "-d", + "--degree", + type=int, + default=3, + help="Degree for poly kernels. Ignored by other kernels", + ) + parser.add_argument( + "-cf", + "--coef0", + type=float, + default=1.0, + help="Independent term in poly and sigmoid kernels", + ) + parser.add_argument( + "-e", + "--eigen_solver", + type=str.lower, + default="auto", + choices=["auto", "dense", "arpack"], + help="Choose the type of eigen solver.", + ) + parser.add_argument( + "-o", "--outfile", help="Base name for output file (no extension)." + ) args = parser.parse_args() usecols = None - cols = [] pca_params = {} - if args.columns == 'by_index_number' or args.columns == 'all_but_by_index_number': - usecols = [int(i) for i in args.column_indices.split(',')] - elif args.columns == 'by_header_name' or args.columns == 'all_but_by_header_name': + if args.columns == "by_index_number" or args.columns == "all_but_by_index_number": + usecols = [int(i) for i in args.column_indices.split(",")] + elif args.columns == "by_header_name" or args.columns == "all_but_by_header_name": usecols = args.column_indices - header = 'infer' if args.header else None + header = "infer" if args.header else None pca_input = read_columns( f=args.infile, c=usecols, c_option=args.columns, - sep='\t', + sep="\t", header=header, parse_dates=True, encoding=None, - index_col=None) + index_col=None, + ) - pca_params.update({'n_components': args.number}) + pca_params.update({"n_components": args.number}) - if args.pca_type == 'classical': - pca_params.update({'svd_solver': args.svd_solver, 'whiten': args.whiten}) - if args.svd_solver == 'arpack': - pca_params.update({'tol': args.tolerance}) + if args.pca_type == "classical": + pca_params.update({"svd_solver": args.svd_solver, "whiten": args.whiten}) + if args.svd_solver == "arpack": + pca_params.update({"tol": args.tolerance}) pca = PCA() - elif args.pca_type == 'incremental': - pca_params.update({'batch_size': args.batch_size, 'whiten': args.whiten}) + elif args.pca_type == "incremental": + pca_params.update({"batch_size": args.batch_size, "whiten": args.whiten}) pca = IncrementalPCA() - elif args.pca_type == 'kernel': - pca_params.update({'kernel': args.kernel, 'eigen_solver': args.eigen_solver, 'gamma': args.gamma}) + elif args.pca_type == "kernel": + pca_params.update( + { + "kernel": args.kernel, + "eigen_solver": args.eigen_solver, + "gamma": args.gamma, + } + ) - if args.kernel == 'poly': - pca_params.update({'degree': args.degree, 'coef0': args.coef0}) - elif args.kernel == 'sigmoid': - pca_params.update({'coef0': args.coef0}) - elif args.kernel == 'precomputed': + if args.kernel == "poly": + pca_params.update({"degree": args.degree, "coef0": args.coef0}) + elif args.kernel == "sigmoid": + pca_params.update({"coef0": args.coef0}) + elif args.kernel == "precomputed": pca_input = np.dot(pca_input, pca_input.T) - if args.eigen_solver == 'arpack': - pca_params.update({'tol': args.tolerance, 'max_iter': args.max_iter}) + if args.eigen_solver == "arpack": + pca_params.update({"tol": args.tolerance, "max_iter": args.max_iter}) pca = KernelPCA() print(pca_params) pca.set_params(**pca_params) pca_output = pca.fit_transform(pca_input) - np.savetxt(fname=args.outfile, X=pca_output, fmt='%.4f', delimiter='\t') + np.savetxt(fname=args.outfile, X=pca_output, fmt="%.4f", delimiter="\t") if __name__ == "__main__": diff -r 7068b5fcd623 -r 1e99cfb71f40 sample_generator.xml --- a/sample_generator.xml Thu Oct 01 20:27:36 2020 +0000 +++ b/sample_generator.xml Tue Apr 13 17:52:15 2021 +0000 @@ -1,10 +1,10 @@ - + random samples with controlled size and complexity main_macros.xml - - + + echo "@VERSION@"
- - - + + + - + - - + +
- - - - - - - + + + + + + + - + - - + +
- - - - - - + + + + + +
- - + +
- - - - - + + + + +
- - - - + + + +
- - - - - + + + + + - - + +
- - - + + +
- - - - + + + +
- - - + + +
- - - + + +
- - - + + +
- - - + + +