feature_selectors.py @ 8:1a9d5a8fff12 (draft)
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit c0a3a186966888e5787335a7628bf0a4382637e7
author: bgruening
date: Tue, 14 May 2019 18:03:50 -0400
1 """
2 DyRFE
3 DyRFECV
4 MyPipeline
5 MyimbPipeline
6 check_feature_importances
7 """
8 import numpy as np
9
10 from imblearn import under_sampling, over_sampling, combine
11 from imblearn.pipeline import Pipeline as imbPipeline
12 from sklearn import (cluster, compose, decomposition, ensemble,
13 feature_extraction, feature_selection,
14 gaussian_process, kernel_approximation,
15 metrics, model_selection, naive_bayes,
16 neighbors, pipeline, preprocessing,
17 svm, linear_model, tree, discriminant_analysis)
18
19 from sklearn.base import BaseEstimator
20 from sklearn.base import MetaEstimatorMixin, clone, is_classifier
21 from sklearn.feature_selection.rfe import _rfe_single_fit, RFE, RFECV
22 from sklearn.model_selection import check_cv
23 from sklearn.metrics.scorer import check_scoring
24 from sklearn.utils import check_X_y, safe_indexing, safe_sqr
25 from sklearn.utils._joblib import Parallel, delayed, effective_n_jobs


class DyRFE(RFE):
    """
    Mainly used with DyRFECV

    Parameters
    ----------
    estimator : object
        A supervised learning estimator with a ``fit`` method that provides
        information about feature importance either through a ``coef_``
        attribute or through a ``feature_importances_`` attribute.
    n_features_to_select : int or None (default=None)
        The number of features to select. If `None`, half of the features
        are selected.
    step : int, float or list, optional (default=1)
        If greater than or equal to 1, then ``step`` corresponds to the
        (integer) number of features to remove at each iteration.
        If within (0.0, 1.0), then ``step`` corresponds to the percentage
        (rounded down) of features to remove at each iteration.
        If list, a series of step sizes, one per iteration; elimination
        stops once the steps are exhausted.
    verbose : int, (default=0)
        Controls verbosity of output.

    """
    def __init__(self, estimator, n_features_to_select=None, step=1,
                 verbose=0):
        super(DyRFE, self).__init__(estimator, n_features_to_select,
                                    step, verbose)

    def _fit(self, X, y, step_score=None):

        if type(self.step) is not list:
            return super(DyRFE, self)._fit(X, y, step_score)

        # dynamic step
        X, y = check_X_y(X, y, "csc")
        # Initialization
        n_features = X.shape[1]
        if self.n_features_to_select is None:
            n_features_to_select = n_features // 2
        else:
            n_features_to_select = self.n_features_to_select

        step = []
        for s in self.step:
            if 0.0 < s < 1.0:
                step.append(int(max(1, s * n_features)))
            else:
                step.append(int(s))
            if s <= 0:
                raise ValueError("Step must be >0")

        support_ = np.ones(n_features, dtype=bool)
        ranking_ = np.ones(n_features, dtype=int)

        if step_score:
            self.scores_ = []

        step_i = 0
        # Elimination
        while np.sum(support_) > n_features_to_select and step_i < len(step):

            # if the last user-supplied step is non-zero, repeat it so the
            # elimination loop can continue down to n_features_to_select
            if step_i == len(step) - 1 and step[step_i] != 0:
                step.append(step[step_i])

            # Remaining features
            features = np.arange(n_features)[support_]

            # Rank the remaining features
            estimator = clone(self.estimator)
            if self.verbose > 0:
                print("Fitting estimator with %d features." % np.sum(support_))

            estimator.fit(X[:, features], y)

            # Get coefs
            if hasattr(estimator, 'coef_'):
                coefs = estimator.coef_
            else:
                coefs = getattr(estimator, 'feature_importances_', None)
            if coefs is None:
                raise RuntimeError('The classifier does not expose '
                                   '"coef_" or "feature_importances_" '
                                   'attributes')

            # Get ranks
            if coefs.ndim > 1:
                ranks = np.argsort(safe_sqr(coefs).sum(axis=0))
            else:
                ranks = np.argsort(safe_sqr(coefs))

            # for sparse case ranks is matrix
            ranks = np.ravel(ranks)

            # Eliminate the worst features
            threshold = min(step[step_i],
                            np.sum(support_) - n_features_to_select)

            # Compute step score on the previous selection iteration
            # because 'estimator' must use features
            # that have not been eliminated yet
            if step_score:
                self.scores_.append(step_score(estimator, features))
            support_[features[ranks][:threshold]] = False
            ranking_[np.logical_not(support_)] += 1

            step_i += 1

        # Set final attributes
        features = np.arange(n_features)[support_]
        self.estimator_ = clone(self.estimator)
        self.estimator_.fit(X[:, features], y)

        # Compute step score when only n_features_to_select features left
        if step_score:
            self.scores_.append(step_score(self.estimator_, features))
        self.n_features_ = support_.sum()
        self.support_ = support_
        self.ranking_ = ranking_

        return self
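

# A minimal usage sketch for DyRFE (illustrative only: the dataset and the
# linear SVC below are assumptions, not part of this module):
#
#     from sklearn.datasets import make_classification
#     from sklearn.svm import SVC
#
#     X, y = make_classification(n_samples=200, n_features=20, random_state=0)
#     # remove 8, then 4, then 2, then 1 feature(s) per iteration
#     selector = DyRFE(SVC(kernel='linear'), n_features_to_select=5,
#                      step=[8, 4, 2, 1])
#     selector.fit(X, y)
#     print(selector.support_.sum())  # -> 5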


class DyRFECV(RFECV, MetaEstimatorMixin):
    """
    Compared with RFECV, DyRFECV accepts a flexible `step` for eliminating
    features: `step` may be a list, whereas RFECV supports only a fixed
    int or float `step`.

    Parameters
    ----------
    estimator : object
        A supervised learning estimator with a ``fit`` method that provides
        information about feature importance either through a ``coef_``
        attribute or through a ``feature_importances_`` attribute.
    step : int, float or list, optional (default=1)
        If greater than or equal to 1, then ``step`` corresponds to the
        (integer) number of features to remove at each iteration.
        If within (0.0, 1.0), then ``step`` corresponds to the percentage
        (rounded down) of features to remove at each iteration.
        If list, a series of step sizes, one per iteration; elimination
        stops once the steps are exhausted.
        Note that the last iteration may remove fewer than ``step`` features
        in order to reach ``min_features_to_select``.
    min_features_to_select : int, (default=1)
        The minimum number of features to be selected. This number of features
        will always be scored, even if the difference between the original
        feature count and ``min_features_to_select`` isn't divisible by
        ``step``.
    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
        - None, to use the default 3-fold cross-validation,
        - integer, to specify the number of folds.
        - :term:`CV splitter`,
        - An iterable yielding (train, test) splits as arrays of indices.
        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`sklearn.model_selection.StratifiedKFold` is used. If the
        estimator is a classifier or if ``y`` is neither binary nor
        multiclass, :class:`sklearn.model_selection.KFold` is used.
        Refer to the :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

        .. versionchanged:: 0.20
            ``cv`` default value of None will change from 3-fold to 5-fold
            in v0.22.
    scoring : string, callable or None, optional, (default=None)
        A string (see model evaluation documentation) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    verbose : int, (default=0)
        Controls verbosity of output.
    n_jobs : int or None, optional (default=None)
        Number of cores to run in parallel while fitting across folds.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.
    """
    def __init__(self, estimator, step=1, min_features_to_select=1, cv='warn',
                 scoring=None, verbose=0, n_jobs=None):
        super(DyRFECV, self).__init__(
            estimator, step=step,
            min_features_to_select=min_features_to_select,
            cv=cv, scoring=scoring, verbose=verbose,
            n_jobs=n_jobs)

    def fit(self, X, y, groups=None):
        """Fit the RFE model and automatically tune the number of selected
        features.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vector, where `n_samples` is the number of samples and
            `n_features` is the total number of features.
        y : array-like, shape = [n_samples]
            Target values (integers for classification, real numbers for
            regression).
        groups : array-like, shape = [n_samples], optional
            Group labels for the samples used while splitting the dataset into
            train/test set.
        """
        if type(self.step) is not list:
            return super(DyRFECV, self).fit(X, y, groups)

        X, y = check_X_y(X, y, "csr")

        # Initialization
        cv = check_cv(self.cv, y, is_classifier(self.estimator))
        scorer = check_scoring(self.estimator, scoring=self.scoring)
        n_features = X.shape[1]

        step = []
        for s in self.step:
            if 0.0 < s < 1.0:
                step.append(int(max(1, s * n_features)))
            else:
                step.append(int(s))
            if s <= 0:
                raise ValueError("Step must be >0")

        # Build an RFE object, which will evaluate and score each possible
        # feature count, down to self.min_features_to_select
        rfe = DyRFE(estimator=self.estimator,
                    n_features_to_select=self.min_features_to_select,
                    step=self.step, verbose=self.verbose)

        # Determine the number of subsets of features by fitting across
        # the train folds and choosing the "features_to_select" parameter
        # that gives the least averaged error across all folds.

        # Note that joblib raises a non-picklable error for bound methods
        # even if n_jobs is set to 1 with the default multiprocessing
        # backend.
        # This branching is done to make sure that user code that sets
        # n_jobs to 1 and provides bound methods as scorers is not broken
        # with the addition of the n_jobs parameter in version 0.18.

        if effective_n_jobs(self.n_jobs) == 1:
            parallel, func = list, _rfe_single_fit
        else:
            parallel = Parallel(n_jobs=self.n_jobs)
            func = delayed(_rfe_single_fit)

        scores = parallel(
            func(rfe, self.estimator, X, y, train, test, scorer)
            for train, test in cv.split(X, y, groups))

        scores = np.sum(scores, axis=0)
        diff = int(scores.shape[0]) - len(step)
        if diff > 0:
            step = np.r_[step, [step[-1]] * diff]
        scores_rev = scores[::-1]
        argmax_idx = len(scores) - np.argmax(scores_rev) - 1
        n_features_to_select = max(
            n_features - sum(step[:argmax_idx]),
            self.min_features_to_select)

        # Re-execute an elimination with best_k over the whole set
        rfe = DyRFE(estimator=self.estimator,
                    n_features_to_select=n_features_to_select, step=self.step,
                    verbose=self.verbose)

        rfe.fit(X, y)

        # Set final attributes
        self.support_ = rfe.support_
        self.n_features_ = rfe.n_features_
        self.ranking_ = rfe.ranking_
        self.estimator_ = clone(self.estimator)
        self.estimator_.fit(self.transform(X), y)

        # Fixing a normalization error: n is equal to get_n_splits(X, y) - 1
        # here; the scores are normalized by get_n_splits(X, y)
        self.grid_scores_ = scores[::-1] / cv.get_n_splits(X, y, groups)
        return self
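

# A minimal usage sketch for DyRFECV (illustrative only: the dataset and the
# estimator below are assumptions, not part of this module):
#
#     from sklearn.datasets import make_classification
#     from sklearn.svm import SVC
#
#     X, y = make_classification(n_samples=200, n_features=20, random_state=0)
#     selector = DyRFECV(SVC(kernel='linear'), step=[8, 4, 2, 1], cv=3)
#     selector.fit(X, y)
#     print(selector.n_features_)  # feature count picked by cross-validation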


class MyPipeline(pipeline.Pipeline):
    """
    Extend the sklearn Pipeline object to expose a feature_importances_
    attribute taken from its final estimator.
    """
    def fit(self, X, y=None, **fit_params):
        super(MyPipeline, self).fit(X, y, **fit_params)
        estimator = self.steps[-1][-1]
        if hasattr(estimator, 'coef_'):
            coefs = estimator.coef_
        else:
            coefs = getattr(estimator, 'feature_importances_', None)
        if coefs is None:
            raise RuntimeError('The estimator in the pipeline does not expose '
                               '"coef_" or "feature_importances_" '
                               'attributes')
        self.feature_importances_ = coefs
        return self


class MyimbPipeline(imbPipeline):
    """
    Extend the imbalanced-learn Pipeline object to expose a
    feature_importances_ attribute taken from its final estimator.
    """
    def fit(self, X, y=None, **fit_params):
        super(MyimbPipeline, self).fit(X, y, **fit_params)
        estimator = self.steps[-1][-1]
        if hasattr(estimator, 'coef_'):
            coefs = estimator.coef_
        else:
            coefs = getattr(estimator, 'feature_importances_', None)
        if coefs is None:
            raise RuntimeError('The estimator in the pipeline does not expose '
                               '"coef_" or "feature_importances_" '
                               'attributes')
        self.feature_importances_ = coefs
        return self
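

# A minimal usage sketch for MyPipeline (illustrative only: the scaler,
# estimator and data below are assumptions, not part of this module):
#
#     from sklearn.datasets import make_classification
#     from sklearn.linear_model import LogisticRegression
#     from sklearn.preprocessing import StandardScaler
#
#     X, y = make_classification(n_samples=100, n_features=10, random_state=0)
#     pipe = MyPipeline([('scale', StandardScaler()),
#                        ('clf', LogisticRegression())])
#     pipe.fit(X, y)
#     print(pipe.feature_importances_.shape)  # coef_ of the last step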


def check_feature_importances(estimator):
    """
    For a pipeline object that has no feature_importances_ attribute,
    return an identically configured pipeline object that attaches the
    last estimator's feature_importances_ after fitting.
    """
    if estimator.__class__.__module__ == 'sklearn.pipeline':
        pipeline_steps = estimator.get_params()['steps']
        estimator = MyPipeline(pipeline_steps)
    elif estimator.__class__.__module__ == 'imblearn.pipeline':
        pipeline_steps = estimator.get_params()['steps']
        estimator = MyimbPipeline(pipeline_steps)
    else:
        return estimator

    # the if/elif branches above rebuilt the pipeline; return it as well
    return estimator
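

# A minimal usage sketch for check_feature_importances (illustrative only:
# the pipeline below is an assumption, not part of this module):
#
#     from sklearn.pipeline import Pipeline
#     from sklearn.preprocessing import StandardScaler
#     from sklearn.ensemble import RandomForestClassifier
#
#     pipe = Pipeline([('scale', StandardScaler()),
#                      ('clf', RandomForestClassifier(n_estimators=10))])
#     pipe = check_feature_importances(pipe)  # now a MyPipeline
#     # after pipe.fit(X, y), pipe.feature_importances_ is available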