comparison utils.py @ 12:8362c6cda4ef draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit d00173591e4a783a4c1cb2664e4bb192ab5414f7
author bgruening
date Fri, 17 Aug 2018 12:29:29 -0400
parents
children 1e02b574f5c0
comparison
equal deleted inserted replaced
11:cef58dfb42c3 12:8362c6cda4ef
1 import sys
2 import os
3 import pandas
4 import re
5 import pickle
6 import warnings
7 import numpy as np
8 import xgboost
9 import scipy
10 import sklearn
11 import ast
12 from asteval import Interpreter, make_symbol_table
13 from sklearn import metrics, model_selection, ensemble, svm, linear_model, naive_bayes, tree, neighbors
14
# Parallelism level handed to estimators/selectors that take ``n_jobs``;
# taken from the GALAXY_SLOTS environment variable, 1 when it is unset.
N_JOBS = int(os.environ.get('GALAXY_SLOTS', 1))
16
def read_columns(f, c=None, c_option='by_index_number', return_df=False, **args):
    """Read a delimited file with pandas and return the selected columns.

    Parameters
    ----------
    f : path or file-like object
        Handed straight to ``pandas.read_csv``.
    c : sequence of int, str or None
        Column selection. For the ``*by_index_number`` options it is a
        sequence of 1-based column numbers; for the ``*by_header_name``
        options it is a comma-separated string of header names.
    c_option : str
        ``'by_index_number'`` / ``'by_header_name'`` keep only the listed
        columns, ``'all_but_by_index_number'`` / ``'all_but_by_header_name'``
        drop them. Any other value keeps every column.
    return_df : bool
        When true, the filtered DataFrame is returned alongside the values.
    **args
        Extra keyword arguments forwarded to ``pandas.read_csv``.

    Returns
    -------
    The ``.values`` array of the selection, or ``(values, DataFrame)`` when
    ``return_df`` is true.

    Raises
    ------
    ValueError
        If ``c_option`` requires a column selection but ``c`` is None.
    """
    index_options = ('by_index_number', 'all_but_by_index_number')
    header_options = ('by_header_name', 'all_but_by_header_name')

    # Fail early with a readable message instead of an opaque TypeError /
    # AttributeError from iterating or splitting None further down.
    if c is None and c_option in index_options + header_options:
        raise ValueError(
            "Column option '%s' requires a column selection, but c is None."
            % c_option)

    data = pandas.read_csv(f, **args)

    if c_option in index_options:
        # User-facing column numbers are 1-based; pandas positions are 0-based.
        cols = [number - 1 for number in c]
        if c_option == 'by_index_number':
            data = data.iloc[:, cols]
        else:
            data = data.drop(data.columns[cols], axis=1)
    elif c_option in header_options:
        cols = [name.strip() for name in c.split(',')]
        if c_option == 'by_header_name':
            data = data[cols]
        else:
            data = data.drop(cols, axis=1)

    y = data.values
    if return_df:
        return y, data
    return y
37
38
39 ## generate an instance for one of sklearn.feature_selection classes
def feature_selector(inputs):
    """Instantiate one of the ``sklearn.feature_selection`` classes.

    Parameters
    ----------
    inputs : dict
        Tool parameters. Keys read here: ``'selected_algorithm'`` (class
        name looked up in ``sklearn.feature_selection``) and ``'options'``
        (keyword arguments for that class). Depending on the algorithm,
        ``'model_inputter'``, ``'estimator_selector'`` or ``'score_func'``
        are read as well.

    Returns
    -------
    The constructed selector instance.
    """
    # Import the submodule explicitly: a plain ``import sklearn`` does not
    # guarantee that ``sklearn.feature_selection`` is available as an attribute.
    from sklearn import feature_selection

    algorithm = inputs['selected_algorithm']
    selector_cls = getattr(feature_selection, algorithm)
    # Shallow copy so the normalisation below never alters the caller's dict.
    options = dict(inputs['options'])

    if algorithm == 'SelectFromModel':
        # An empty value or the literal string 'None' both mean "no threshold".
        if not options['threshold'] or options['threshold'] == 'None':
            options['threshold'] = None
        model_inputter = inputs['model_inputter']
        if model_inputter['input_mode'] == 'prefitted':
            # NOTE(review): pickle.load can execute arbitrary code; the
            # fitted-estimator file must come from a trusted source.
            with open(model_inputter['fitted_estimator'], 'rb') as model_handler:
                fitted_estimator = pickle.load(model_handler)
            return selector_cls(fitted_estimator, prefit=True, **options)
        estimator = get_estimator(model_inputter['estimator_selector'])
        return selector_cls(estimator, **options)

    if algorithm == 'RFE':
        estimator = get_estimator(inputs['estimator_selector'])
        return selector_cls(estimator, **options)

    if algorithm == 'RFECV':
        options['scoring'] = get_scoring(options['scoring'])
        options['n_jobs'] = N_JOBS
        options['cv'] = get_cv(options['cv'].strip())
        estimator = get_estimator(inputs['estimator_selector'])
        return selector_cls(estimator, **options)

    if algorithm == 'VarianceThreshold':
        return selector_cls(**options)

    # Every remaining algorithm takes a scoring function as first argument.
    score_func = getattr(feature_selection, inputs['score_func'])
    return selector_cls(score_func, **options)
78
79
def get_X_y(params, file1, file2):
    """Load the feature matrix X and the target vector y.

    Parameters
    ----------
    params : dict
        Tool parameters; the settings are read from
        ``params["selected_tasks"]["selected_algorithms"]["input_options"]``.
    file1 : path of the X input. Read as a tab-separated table when
        ``selected_input`` is ``"tabular"``, otherwise with
        ``scipy.io.mmread``.
    file2 : path of the tab-separated y input.

    Returns
    -------
    tuple ``(X, y)`` where ``y`` has been flattened with ``ravel()``.
    """
    input_options = params["selected_tasks"]["selected_algorithms"]["input_options"]
    column_based = ("by_index_number", "all_but_by_index_number",
                    "by_header_name", "all_but_by_header_name")

    def read_selection(path, has_header, selector, option_key, column_key):
        # Shared reader for X and y: only column-based options carry a
        # column list, every other option reads all columns.
        column_option = selector[option_key]
        c = selector[column_key] if column_option in column_based else None
        return read_columns(
            path,
            c=c,
            c_option=column_option,
            sep='\t',
            header='infer' if has_header else None,
            parse_dates=True
        )

    if input_options["selected_input"] == "tabular":
        X = read_selection(
            file1,
            input_options["header1"],
            input_options["column_selector_options_1"],
            "selected_column_selector_option",
            "col1"
        )
    else:
        # mmread is not imported at module level; import it where it is used
        # so this branch no longer fails with NameError.
        from scipy.io import mmread
        X = mmread(file1)

    y = read_selection(
        file2,
        input_options["header2"],
        input_options["column_selector_options_2"],
        "selected_column_selector_option2",
        "col2"
    )
    return X, y.ravel()
116
117
class SafeEval(Interpreter):
    """asteval ``Interpreter`` with a restricted, hand-picked symbol table.

    Parameters
    ----------
    load_scipy : bool
        Expose every ``rv_continuous`` / ``rv_discrete`` instance found in
        ``scipy.stats.distributions`` as ``scipy_stats_<name>``.
    load_numpy : bool
        Expose the listed ``numpy.random`` members as ``np_random_<name>``.
    """

    def __init__(self, load_scipy=False, load_numpy=False):

        # File opening and other unneeded functions could be dropped
        unwanted = ['open', 'type', 'dir', 'id', 'str', 'repr']

        # Allowed symbol table. Add more if needed.
        new_syms = {
            'np_arange': getattr(np, 'arange'),
            'ensemble_ExtraTreesClassifier': getattr(ensemble, 'ExtraTreesClassifier')
        }

        syms = make_symbol_table(use_numpy=False, **new_syms)

        if load_scipy:
            # Import the submodule explicitly rather than relying on it
            # having been loaded as a side effect of another import.
            from scipy import stats
            for key, value in stats.distributions.__dict__.items():
                if isinstance(value, (stats.rv_continuous, stats.rv_discrete)):
                    syms['scipy_stats_' + key] = value

        if load_numpy:
            from_numpy_random = ['beta', 'binomial', 'bytes', 'chisquare', 'choice', 'dirichlet', 'division',
                                 'exponential', 'f', 'gamma', 'geometric', 'gumbel', 'hypergeometric',
                                 'laplace', 'logistic', 'lognormal', 'logseries', 'mtrand', 'multinomial',
                                 'multivariate_normal', 'negative_binomial', 'noncentral_chisquare', 'noncentral_f',
                                 'normal', 'pareto', 'permutation', 'poisson', 'power', 'rand', 'randint',
                                 'randn', 'random', 'random_integers', 'random_sample', 'ranf', 'rayleigh',
                                 'sample', 'seed', 'set_state', 'shuffle', 'standard_cauchy', 'standard_exponential',
                                 'standard_gamma', 'standard_normal', 'standard_t', 'triangular', 'uniform',
                                 'vonmises', 'wald', 'weibull', 'zipf']
            for name in from_numpy_random:
                # Some of these legacy names (e.g. 'division') are not
                # present in every NumPy release; skip the missing ones
                # instead of failing with AttributeError.
                member = getattr(np.random, name, None)
                if member is not None:
                    syms['np_random_' + name] = member

        for key in unwanted:
            syms.pop(key, None)

        # NOTE(review): the no_* flags are passed through to asteval as-is;
        # presumably each one disables the matching statement type — confirm
        # against the asteval version in use.
        super(SafeEval, self).__init__(symtable=syms, use_numpy=False, minimal=False,
                                       no_if=True, no_for=True, no_while=True, no_try=True,
                                       no_functiondef=True, no_ifexp=True, no_listcomp=False,
                                       no_augassign=False, no_assert=True, no_delete=True,
                                       no_raise=True, no_print=True)
160
161
def get_search_params(params_builder):
    """Translate the tool's search-parameter entries into a parameter dict.

    Parameters
    ----------
    params_builder : dict
        Its ``'param_set'`` list is iterated; every entry holds a
        ``'search_param_selector'`` dict with ``'search_p'`` (text of the
        form ``name: expression``) and ``'selected_param_type'``.

    Returns
    -------
    dict mapping a prefixed parameter name to the value obtained by
    evaluating the expression with ``SafeEval``. The prefix is
    ``estimator__`` for ``final_estimator_p`` entries, otherwise
    ``preprocessing_<c>__`` where ``<c>`` is the character at index 5 of
    the parameter type.

    Raises
    ------
    ValueError
        If a non-blank ``search_p`` does not contain exactly one colon.
    """
    search_params = {}
    safe_eval = SafeEval(load_scipy=True, load_numpy=True)

    for p in params_builder['param_set']:
        selector = p['search_param_selector']
        search_p = selector['search_p']
        if search_p.strip() == '':
            # Blank entries are ignored.
            continue
        param_type = selector['selected_param_type']

        parts = search_p.split(":")
        # Explicit check instead of ``assert`` so the validation still runs
        # when Python is started with -O.
        if len(parts) != 2:
            raise ValueError("Error, make sure there is one and only one colon in search parameter input.")

        name = parts[0].strip()
        ev = safe_eval(parts[1].strip())
        if param_type == "final_estimator_p":
            key = "estimator__" + name
        else:
            key = "preprocessing_" + param_type[5:6] + "__" + name
        search_params[key] = ev

    return search_params
182
183
def get_estimator(estimator_json):
    """Instantiate and configure an estimator from the tool parameters.

    Parameters
    ----------
    estimator_json : dict
        ``'selected_module'`` is either ``"xgboost"`` or the name of an
        ``sklearn`` submodule, ``'selected_estimator'`` is the class name
        inside it, and ``'text_params'`` is an optional string of keyword
        arguments (``"a=1, b=2"``).

    Returns
    -------
    The estimator instance, with ``n_jobs`` set to ``N_JOBS`` whenever the
    estimator exposes that parameter.

    Exits the process through ``sys.exit`` when ``text_params`` cannot be
    turned into a dict.
    """
    estimator_module = estimator_json['selected_module']
    estimator_cls = estimator_json['selected_estimator']

    if estimator_module == "xgboost":
        cls = getattr(xgboost, estimator_cls)
    else:
        module = getattr(sklearn, estimator_module)
        cls = getattr(module, estimator_cls)

    estimator = cls()

    estimator_params = estimator_json['text_params'].strip()
    if estimator_params != "":
        # The interpreter has to be created here: no ``safe_eval`` exists at
        # module level, so the call below used to raise NameError.
        safe_eval = SafeEval()
        try:
            params = safe_eval('dict(' + estimator_params + ')')
        except ValueError:
            sys.exit("Unsupported parameter input: `%s`" % estimator_params)
        # NOTE(review): guards against the interpreter yielding a non-dict
        # (e.g. None on a failed evaluation) — confirm asteval's behaviour.
        if not isinstance(params, dict):
            sys.exit("Unsupported parameter input: `%s`" % estimator_params)
        estimator.set_params(**params)
    if 'n_jobs' in estimator.get_params():
        estimator.set_params(n_jobs=N_JOBS)

    return estimator
207
208
def get_cv(literal):
    """Turn the textual cross-validation setting into a ``cv`` argument.

    Parameters
    ----------
    literal : str
        ``""``, a string of digits, or ``ClassName(arg=value, ...)`` where
        ``ClassName`` is looked up in ``sklearn.model_selection``.

    Returns
    -------
    ``None`` for the empty string, an ``int`` for a digit string, otherwise
    an instance of the named class built from the evaluated keyword
    arguments.

    Exits the process through ``sys.exit`` for any other input.
    """
    if literal == "":
        return None
    if literal.isdigit():
        return int(literal)
    m = re.match(r'^(?P<method>\w+)\((?P<args>.*)\)$', literal)
    if m:
        my_class = getattr(model_selection, m.group('method'))
        # The interpreter is only needed on this branch, so it is created
        # here rather than on every call.
        safe_eval = SafeEval()
        args = safe_eval('dict(' + m.group('args') + ')')
        return my_class(**args)
    sys.exit("Unsupported CV input: %s" % literal)
221
222
def get_scoring(scoring_json):
    """Resolve the scoring selection into scorer object(s).

    ``scoring_json['primary_scoring']`` equal to ``"default"`` yields
    ``None``. Otherwise scorers are looked up in ``metrics.SCORERS`` (a
    ``'balanced_accuracy'`` entry is added there when absent). When
    ``'secondary_scoring'`` is neither ``'None'`` nor identical to the
    primary name, a dict is returned holding the primary scorer under
    ``'primary'`` plus one entry per comma-separated secondary name;
    otherwise only the primary scorer is returned.
    """
    primary_name = scoring_json['primary_scoring']
    if primary_name == "default":
        return None

    def balanced_accuracy_score(y_true, y_pred):
        # Mean of the per-row ratio diagonal / row-sum of the confusion matrix.
        confusion = metrics.confusion_matrix(y_true, y_pred)
        with np.errstate(divide='ignore', invalid='ignore'):
            per_class = np.diag(confusion) / confusion.sum(axis=1)
        undefined = np.isnan(per_class)
        if np.any(undefined):
            warnings.warn('y_pred contains classes not in y_true')
            per_class = per_class[~undefined]
        return np.mean(per_class)

    registry = metrics.SCORERS
    if 'balanced_accuracy' not in registry:
        registry['balanced_accuracy'] = metrics.make_scorer(balanced_accuracy_score)

    secondary_names = scoring_json['secondary_scoring']
    if secondary_names == 'None' or secondary_names == primary_name:
        return registry[primary_name]

    scoring = {'primary': registry[primary_name]}
    for scorer_name in secondary_names.split(','):
        if scorer_name == primary_name:
            # The primary scorer is already stored under 'primary'.
            continue
        scoring[scorer_name] = registry[scorer_name]
    return scoring
251