diff --git a/docs/sources/CHANGELOG.md b/docs/sources/CHANGELOG.md
index a60229684..8c6d2a217 100755
--- a/docs/sources/CHANGELOG.md
+++ b/docs/sources/CHANGELOG.md
@@ -17,6 +17,10 @@ The CHANGELOG for the current development version is available at
 
 ##### New Features
+
+- The `fit` method of `SequentialFeatureSelector` now optionally accepts `**fit_params` for the estimator that is used for feature selection. ([#350](https://github.com/rasbt/mlxtend/pull/350) by Zach Griffith)
+
+
 - -
 
 ##### Changes
diff --git a/docs/sources/user_guide/feature_selection/SequentialFeatureSelector.ipynb b/docs/sources/user_guide/feature_selection/SequentialFeatureSelector.ipynb
index 2185a7715..107f4fc04 100644
--- a/docs/sources/user_guide/feature_selection/SequentialFeatureSelector.ipynb
+++ b/docs/sources/user_guide/feature_selection/SequentialFeatureSelector.ipynb
@@ -1435,7 +1435,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [
     {
@@ -1555,7 +1555,7 @@
     "\n",
     "<hr>\n",
\n", "\n", - "*fit(X, y)*\n", + "*fit(X, y, **fit_params)*\n", "\n", "Perform feature selection and learn model from training data.\n", "\n", @@ -1570,6 +1570,10 @@ "\n", " Target values.\n", "\n", + "- `fit_params` : dict of string -> object, optional\n", + "\n", + " Parameters to pass to to the fit method of classifier.\n", + "\n", "**Returns**\n", "\n", "- `self` : object\n", @@ -1577,7 +1581,7 @@ "\n", "
\n", "\n", - "*fit_transform(X, y)*\n", + "*fit_transform(X, y, **fit_params)*\n", "\n", "Fit to training data then reduce X to its most important features.\n", "\n", @@ -1588,6 +1592,10 @@ " Training vectors, where n_samples is the number of samples and\n", " n_features is the number of features.\n", "\n", + "- `fit_params` : dict of string -> object, optional\n", + "\n", + " Parameters to pass to to the fit method of classifier.\n", + "\n", "**Returns**\n", "\n", "Reduced feature subset of X, shape={n_samples, k_features}\n", diff --git a/mlxtend/feature_selection/sequential_feature_selector.py b/mlxtend/feature_selection/sequential_feature_selector.py index ea93ab70d..2f8aed793 100644 --- a/mlxtend/feature_selection/sequential_feature_selector.py +++ b/mlxtend/feature_selection/sequential_feature_selector.py @@ -22,16 +22,17 @@ from sklearn.externals.joblib import Parallel, delayed -def _calc_score(selector, X, y, indices): +def _calc_score(selector, X, y, indices, **fit_params): if selector.cv: scores = cross_val_score(selector.est_, X[:, indices], y, cv=selector.cv, scoring=selector.scorer, n_jobs=1, - pre_dispatch=selector.pre_dispatch) + pre_dispatch=selector.pre_dispatch, + fit_params=fit_params) else: - selector.est_.fit(X[:, indices], y) + selector.est_.fit(X[:, indices], y, **fit_params) scores = np.array([selector.scorer(selector.est_, X[:, indices], y)]) return indices, scores @@ -169,7 +170,7 @@ def __init__(self, estimator, k_features=1, # don't mess with this unless testing self._TESTING_INTERRUPT_MODE = False - def fit(self, X, y): + def fit(self, X, y, **fit_params): """Perform feature selection and learn model from training data. Parameters @@ -179,6 +180,8 @@ def fit(self, X, y): n_features is the number of features. y : array-like, shape = [n_samples] Target values. + fit_params : dict of string -> object, optional + Parameters to pass to to the fit method of classifier. 
         Returns
         -------
@@ -248,7 +251,7 @@ def fit(self, X, y):
             k_to_select = min_k
         k_idx = tuple(range(X.shape[1]))
         k = len(k_idx)
-        k_idx, k_score = _calc_score(self, X, y, k_idx)
+        k_idx, k_score = _calc_score(self, X, y, k_idx, **fit_params)
         self.subsets_[k] = {
             'feature_idx': k_idx,
             'cv_scores': k_score,
@@ -266,14 +269,16 @@ def fit(self, X, y):
                     orig_set=orig_set,
                     subset=prev_subset,
                     X=X,
-                    y=y
+                    y=y,
+                    **fit_params
                 )
             else:
                 k_idx, k_score, cv_scores = self._exclusion(
                     feature_set=prev_subset,
                     X=X,
-                    y=y
+                    y=y,
+                    **fit_params
                 )
 
             if self.floating:
@@ -298,7 +303,8 @@ def fit(self, X, y):
                             feature_set=k_idx,
                             fixed_feature=new_feature,
                             X=X,
-                            y=y
+                            y=y,
+                            **fit_params
                         )
 
                     else:
@@ -306,7 +312,8 @@ def fit(self, X, y):
                             orig_set=orig_set - {new_feature},
                             subset=set(k_idx),
                             X=X,
-                            y=y
+                            y=y,
+                            **fit_params
                         )
 
                     if k_score_c is not None and k_score_c > k_score:
@@ -395,7 +402,7 @@ def fit(self, X, y):
         self.fitted = True
         return self
 
-    def _inclusion(self, orig_set, subset, X, y, ignore_feature=None):
+    def _inclusion(self, orig_set, subset, X, y, ignore_feature=None, **fit_params):
         all_avg_scores = []
         all_cv_scores = []
         all_subsets = []
@@ -407,7 +414,7 @@ def _inclusion(self, orig_set, subset, X, y, ignore_feature=None):
             parallel = Parallel(n_jobs=n_jobs, verbose=self.verbose,
                                 pre_dispatch=self.pre_dispatch)
             work = parallel(delayed(_calc_score)
-                            (self, X, y, tuple(subset | {feature}))
+                            (self, X, y, tuple(subset | {feature}), **fit_params)
                             for feature in remaining
                             if feature != ignore_feature)
@@ -422,7 +429,7 @@ def _inclusion(self, orig_set, subset, X, y, ignore_feature=None):
                    all_cv_scores[best])
         return res
 
-    def _exclusion(self, feature_set, X, y, fixed_feature=None):
+    def _exclusion(self, feature_set, X, y, fixed_feature=None, **fit_params):
         n = len(feature_set)
         res = (None, None, None)
         if n > 1:
@@ -433,7 +440,7 @@ def _exclusion(self, feature_set, X, y, fixed_feature=None):
            n_jobs = min(self.n_jobs, features)
            parallel = Parallel(n_jobs=n_jobs, verbose=self.verbose,
                                pre_dispatch=self.pre_dispatch)
-            work = parallel(delayed(_calc_score)(self, X, y, p)
+            work = parallel(delayed(_calc_score)(self, X, y, p, **fit_params)
                             for p in combinations(feature_set, r=n - 1)
                             if not fixed_feature or fixed_feature in set(p))
@@ -466,7 +473,7 @@ def transform(self, X):
         self._check_fitted()
         return X[:, self.k_feature_idx_]
 
-    def fit_transform(self, X, y):
+    def fit_transform(self, X, y, **fit_params):
         """Fit to training data then reduce X to its most important features.
 
         Parameters
@@ -474,13 +481,15 @@ def fit_transform(self, X, y):
         X : {array-like, sparse matrix}, shape = [n_samples, n_features]
             Training vectors, where n_samples is the number of samples and
             n_features is the number of features.
+        fit_params : dict of string -> object, optional
+            Parameters to pass to the fit method of the classifier.
 
         Returns
         -------
         Reduced feature subset of X, shape={n_samples, k_features}
 
         """
-        self.fit(X, y)
+        self.fit(X, y, **fit_params)
         return self.transform(X)
 
     def get_metric_dict(self, confidence_interval=0.95):
diff --git a/mlxtend/feature_selection/tests/test_sequential_feature_selector.py b/mlxtend/feature_selection/tests/test_sequential_feature_selector.py
index ffdabbecc..6c2f6ba99 100644
--- a/mlxtend/feature_selection/tests/test_sequential_feature_selector.py
+++ b/mlxtend/feature_selection/tests/test_sequential_feature_selector.py
@@ -61,6 +61,18 @@ def test_run_default():
     assert sfs.k_feature_idx_ == (3,)
 
 
+def test_fit_params():
+    iris = load_iris()
+    X = iris.data
+    y = iris.target
+    sample_weight = np.ones(X.shape[0])
+    forest = RandomForestClassifier(n_estimators=100, random_state=123)
+    sfs = SFS(estimator=forest,
+              verbose=0)
+    sfs.fit(X, y, sample_weight=sample_weight)
+    assert sfs.k_feature_idx_ == (3,)
+
+
 def test_kfeatures_type_1():
     iris = load_iris()
     X = iris.data
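
A minimal usage sketch of the `**fit_params` pass-through added by this patch. It uses only the public mlxtend and scikit-learn APIs shown in the imports; the per-class sample weights and the hyperparameter choices (`k_features=2`, `cv=3`) are illustrative, not prescribed by the patch:

```python
import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

iris = load_iris()
X, y = iris.data, iris.target

# Illustrative per-sample weights: upweight class 0. Any keyword accepted
# by the estimator's fit() method can be forwarded the same way.
sample_weight = np.where(y == 0, 2.0, 1.0)

forest = RandomForestClassifier(n_estimators=100, random_state=123)
sfs = SFS(estimator=forest, k_features=2, forward=True, cv=3)

# sample_weight travels through **fit_params to forest.fit() for every
# candidate feature subset: via cross_val_score's fit_params when cv is
# set, or directly via est_.fit() otherwise.
sfs = sfs.fit(X, y, sample_weight=sample_weight)
print('Selected feature indices:', sfs.k_feature_idx_)
```

Because the selector forwards the keywords unchanged to every internal fit call, no selector-side configuration is needed beyond passing them to `fit` or `fit_transform`.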