changeset 14:84724d805bfa draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 7c2fd140e89605fe689c39e21d70a400545e38cf
author bgruening
date Tue, 10 Jul 2018 03:11:34 -0400
parents 6352834b1c99
children f02eeabab5d1
files ensemble.xml test-data/gbc_result01 test-data/rfr_result01
diffstat 3 files changed, 47 insertions(+), 34 deletions(-) [+]
line wrap: on
line diff
--- a/ensemble.xml	Mon Jul 09 14:32:15 2018 -0400
+++ b/ensemble.xml	Tue Jul 10 03:11:34 2018 -0400
@@ -25,45 +25,58 @@
 @COLUMNS_FUNCTION@
 @GET_X_y_FUNCTION@
 
+# Get inputs, outputs.
 input_json_path = sys.argv[1]
 params = json.load(open(input_json_path, "r"))
+print params
 
+# Put all cheetah up here to avoid confusion.
 #if $selected_tasks.selected_task == "train":
+infile1 = "$selected_tasks.selected_algorithms.input_options.infile1"
+infile2 = "$selected_tasks.selected_algorithms.input_options.infile2"
+#else:
+infile_model = "$selected_tasks.infile_model"
+infile_data = "$selected_tasks.infile_data"
+#end if
+outfile_fit = "$outfile_fit"
+outfile_predict = "$outfile_predict"
+
+# All Python from here on out:
 
-algorithm = params["selected_tasks"]["selected_algorithms"]["selected_algorithm"]
-options = params["selected_tasks"]["selected_algorithms"]["options"]
-if "select_max_features" in options:
-    if options["select_max_features"]["max_features"] == "number_input":
-        options["select_max_features"]["max_features"] = options["select_max_features"]["num_max_features"]
-        options["select_max_features"].pop("num_max_features")
-    options["max_features"] = options["select_max_features"]["max_features"]
-    options.pop("select_max_features")
-if "presort" in options:
-    if options["presort"] == "true":
-        options["presort"] = True
-    if options["presort"] == "false":
-        options["presort"] = False
-if "min_samples_leaf" in options and options["min_samples_leaf"] == 1.0:
-    options["min_samples_leaf"] = 1
-if "min_samples_split" in options and options["min_samples_split"] > 1.0:
-    options["min_samples_split"] = int(options["min_samples_split"])
+if params["selected_tasks"]["selected_task"] == "train":
+    algorithm = params["selected_tasks"]["selected_algorithms"]["selected_algorithm"]
+    options = params["selected_tasks"]["selected_algorithms"]["options"]
+    if "select_max_features" in options:
+        if options["select_max_features"]["max_features"] == "number_input":
+            options["select_max_features"]["max_features"] = options["select_max_features"]["num_max_features"]
+            options["select_max_features"].pop("num_max_features")
+        options["max_features"] = options["select_max_features"]["max_features"]
+        options.pop("select_max_features")
+    if "presort" in options:
+        if options["presort"] == "true":
+            options["presort"] = True
+        if options["presort"] == "false":
+            options["presort"] = False
+    if "min_samples_leaf" in options and options["min_samples_leaf"] == 1.0:
+        options["min_samples_leaf"] = 1
+    if "min_samples_split" in options and options["min_samples_split"] > 1.0:
+        options["min_samples_split"] = int(options["min_samples_split"])
 
-X, y = get_X_y(params, "$selected_tasks.selected_algorithms.input_options.infile1" ,"$selected_tasks.selected_algorithms.input_options.infile2")
-
-my_class = getattr(sklearn.ensemble, algorithm)
-estimator = my_class(**options)
-estimator.fit(X,y)
-pickle.dump(estimator,open("$outfile_fit", 'w+'), pickle.HIGHEST_PROTOCOL)
+    X, y = get_X_y(params, infile1, infile2)
+                   
+    my_class = getattr(sklearn.ensemble, algorithm)
+    estimator = my_class(**options)
+    estimator.fit(X,y)
+    pickle.dump(estimator,open(outfile_fit, 'w+'), pickle.HIGHEST_PROTOCOL)
 
-#else:
-classifier_object = pickle.load(open("$selected_tasks.infile_model", 'r'))
-header = 'infer' if params["selected_tasks"]["header"] else None
-data = pandas.read_csv("$selected_tasks.infile_data", sep='\t', header=header, index_col=None, parse_dates=True, encoding=None, tupleize_cols=False)
-prediction = classifier_object.predict(data)
-prediction_df = pandas.DataFrame(prediction)
-res = pandas.concat([data, prediction_df], axis=1)
-res.to_csv(path_or_buf = "$outfile_predict", sep="\t", index=False)
-#end if
+else:
+    classifier_object = pickle.load(open(infile_model, 'r'))
+    header = 'infer' if params["selected_tasks"]["header"] else None
+    data = pandas.read_csv(infile_data, sep='\t', header=header, index_col=None, parse_dates=True, encoding=None, tupleize_cols=False)
+    prediction = classifier_object.predict(data)
+    prediction_df = pandas.DataFrame(prediction, columns=["predicted"])
+    res = pandas.concat([data, prediction_df], axis=1)
+    res.to_csv(path_or_buf = outfile_predict, sep="\t", index=False)
 
 ]]>
         </configfile>
--- a/test-data/gbc_result01	Mon Jul 09 14:32:15 2018 -0400
+++ b/test-data/gbc_result01	Tue Jul 10 03:11:34 2018 -0400
@@ -1,4 +1,4 @@
-0	1	2	3	0
+0	1	2	3	predicted
 3.68258022948	2.82110345641	-3.990140724	-1.9523364774	1
 0.015942057224	-0.711958594347	0.125502976978	-0.972218263337	0
 2.08690768825	0.929399321468	-2.12924084484	-1.99714022188	1
--- a/test-data/rfr_result01	Mon Jul 09 14:32:15 2018 -0400
+++ b/test-data/rfr_result01	Tue Jul 10 03:11:34 2018 -0400
@@ -1,4 +1,4 @@
-86.9702122735	1.00532111569	-1.01739601979	-0.613139481654	0.641846874331	0
+86.9702122735	1.00532111569	-1.01739601979	-0.613139481654	0.641846874331	predicted
 91.2021798817	-0.6215229712070001	1.11914889596	0.390012184498	1.28956938152	0.8511213285107001
 -47.4101632272	-0.638416457964	-0.7327774684530001	-0.8640261049779999	-1.06109770116	0.05344095304070007
 61.712804630200004	-1.0999480057700002	-0.739679672932	0.585657963012	1.4890682753600002	1.1892759745694002