Mercurial > repos > bgruening > sklearn_ensemble
changeset 14:84724d805bfa draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 7c2fd140e89605fe689c39e21d70a400545e38cf
| author | bgruening |
|---|---|
| date | Tue, 10 Jul 2018 03:11:34 -0400 |
| parents | 6352834b1c99 |
| children | f02eeabab5d1 |
| files | ensemble.xml test-data/gbc_result01 test-data/rfr_result01 |
| diffstat | 3 files changed, 47 insertions(+), 34 deletions(-) [+] |
line wrap: on
line diff
--- a/ensemble.xml Mon Jul 09 14:32:15 2018 -0400 +++ b/ensemble.xml Tue Jul 10 03:11:34 2018 -0400 @@ -25,45 +25,58 @@ @COLUMNS_FUNCTION@ @GET_X_y_FUNCTION@ +# Get inputs, outputs. input_json_path = sys.argv[1] params = json.load(open(input_json_path, "r")) +print params +# Put all cheetah up here to avoid confusion. #if $selected_tasks.selected_task == "train": +infile1 = "$selected_tasks.selected_algorithms.input_options.infile1" +infile2 = "$selected_tasks.selected_algorithms.input_options.infile2" +#else: +infile_model = "$selected_tasks.infile_model" +infile_data = "$selected_tasks.infile_data" +#end if +outfile_fit = "$outfile_fit" +outfile_predict = "$outfile_predict" + +# All Python from here on out: -algorithm = params["selected_tasks"]["selected_algorithms"]["selected_algorithm"] -options = params["selected_tasks"]["selected_algorithms"]["options"] -if "select_max_features" in options: - if options["select_max_features"]["max_features"] == "number_input": - options["select_max_features"]["max_features"] = options["select_max_features"]["num_max_features"] - options["select_max_features"].pop("num_max_features") - options["max_features"] = options["select_max_features"]["max_features"] - options.pop("select_max_features") -if "presort" in options: - if options["presort"] == "true": - options["presort"] = True - if options["presort"] == "false": - options["presort"] = False -if "min_samples_leaf" in options and options["min_samples_leaf"] == 1.0: - options["min_samples_leaf"] = 1 -if "min_samples_split" in options and options["min_samples_split"] > 1.0: - options["min_samples_split"] = int(options["min_samples_split"]) +if params["selected_tasks"]["selected_task"] == "train": + algorithm = params["selected_tasks"]["selected_algorithms"]["selected_algorithm"] + options = params["selected_tasks"]["selected_algorithms"]["options"] + if "select_max_features" in options: + if options["select_max_features"]["max_features"] == "number_input": + 
options["select_max_features"]["max_features"] = options["select_max_features"]["num_max_features"] + options["select_max_features"].pop("num_max_features") + options["max_features"] = options["select_max_features"]["max_features"] + options.pop("select_max_features") + if "presort" in options: + if options["presort"] == "true": + options["presort"] = True + if options["presort"] == "false": + options["presort"] = False + if "min_samples_leaf" in options and options["min_samples_leaf"] == 1.0: + options["min_samples_leaf"] = 1 + if "min_samples_split" in options and options["min_samples_split"] > 1.0: + options["min_samples_split"] = int(options["min_samples_split"]) -X, y = get_X_y(params, "$selected_tasks.selected_algorithms.input_options.infile1" ,"$selected_tasks.selected_algorithms.input_options.infile2") - -my_class = getattr(sklearn.ensemble, algorithm) -estimator = my_class(**options) -estimator.fit(X,y) -pickle.dump(estimator,open("$outfile_fit", 'w+'), pickle.HIGHEST_PROTOCOL) + X, y = get_X_y(params, infile1, infile2) + + my_class = getattr(sklearn.ensemble, algorithm) + estimator = my_class(**options) + estimator.fit(X,y) + pickle.dump(estimator,open(outfile_fit, 'w+'), pickle.HIGHEST_PROTOCOL) -#else: -classifier_object = pickle.load(open("$selected_tasks.infile_model", 'r')) -header = 'infer' if params["selected_tasks"]["header"] else None -data = pandas.read_csv("$selected_tasks.infile_data", sep='\t', header=header, index_col=None, parse_dates=True, encoding=None, tupleize_cols=False) -prediction = classifier_object.predict(data) -prediction_df = pandas.DataFrame(prediction) -res = pandas.concat([data, prediction_df], axis=1) -res.to_csv(path_or_buf = "$outfile_predict", sep="\t", index=False) -#end if +else: + classifier_object = pickle.load(open(infile_model, 'r')) + header = 'infer' if params["selected_tasks"]["header"] else None + data = pandas.read_csv(infile_data, sep='\t', header=header, index_col=None, parse_dates=True, encoding=None, 
tupleize_cols=False) + prediction = classifier_object.predict(data) + prediction_df = pandas.DataFrame(prediction, columns=["predicted"]) + res = pandas.concat([data, prediction_df], axis=1) + res.to_csv(path_or_buf = outfile_predict, sep="\t", index=False) ]]> </configfile>
--- a/test-data/gbc_result01 Mon Jul 09 14:32:15 2018 -0400 +++ b/test-data/gbc_result01 Tue Jul 10 03:11:34 2018 -0400 @@ -1,4 +1,4 @@ -0 1 2 3 0 +0 1 2 3 predicted 3.68258022948 2.82110345641 -3.990140724 -1.9523364774 1 0.015942057224 -0.711958594347 0.125502976978 -0.972218263337 0 2.08690768825 0.929399321468 -2.12924084484 -1.99714022188 1
--- a/test-data/rfr_result01 Mon Jul 09 14:32:15 2018 -0400 +++ b/test-data/rfr_result01 Tue Jul 10 03:11:34 2018 -0400 @@ -1,4 +1,4 @@ -86.9702122735 1.00532111569 -1.01739601979 -0.613139481654 0.641846874331 0 +86.9702122735 1.00532111569 -1.01739601979 -0.613139481654 0.641846874331 predicted 91.2021798817 -0.6215229712070001 1.11914889596 0.390012184498 1.28956938152 0.8511213285107001 -47.4101632272 -0.638416457964 -0.7327774684530001 -0.8640261049779999 -1.06109770116 0.05344095304070007 61.712804630200004 -1.0999480057700002 -0.739679672932 0.585657963012 1.4890682753600002 1.1892759745694002