Adds fit_params support for SequentialFeatureSelector #350

Merged · 4 commits · Mar 20, 2018
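In short, this PR lets `SequentialFeatureSelector.fit` (and, by extension, `fit_transform`) forward arbitrary keyword arguments to the `fit` method of the underlying estimator. A minimal usage sketch of the new call pattern; `sample_weight` is just one illustrative fit parameter, and any keyword the wrapped estimator's `fit` accepts would work:

```python
import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

iris = load_iris()
weights = np.ones(iris.data.shape[0])  # per-sample weights for the estimator

sfs = SFS(RandomForestClassifier(n_estimators=100, random_state=123),
          k_features=3)
# New in this PR: extra keyword arguments to fit are passed through to
# RandomForestClassifier.fit (or to cross_val_score when cv is active).
sfs.fit(iris.data, iris.target, sample_weight=weights)
print(sfs.k_feature_idx_)
```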
4 changes: 4 additions & 0 deletions docs/sources/CHANGELOG.md
@@ -17,6 +17,10 @@ The CHANGELOG for the current development version is available at

##### New Features


- The `fit` method of the `SequentialFeatureSelector` now optionally accepts `**fit_params` for the estimator that is used for the feature selection. ([#350](https://github.com/rasbt/mlxtend/pull/350) by Zach Griffith)


- -

##### Changes
@@ -1435,7 +1435,7 @@
},
{
"cell_type": "code",
"execution_count": 29,
"execution_count": 1,
"metadata": {},
"outputs": [
{
@@ -1555,7 +1555,7 @@
"\n",
"<hr>\n",
"\n",
"*fit(X, y)*\n",
"*fit(X, y, **fit_params)*\n",
"\n",
"Perform feature selection and learn model from training data.\n",
"\n",
@@ -1570,14 +1570,18 @@
"\n",
" Target values.\n",
"\n",
"- `fit_params` : dict of string -> object, optional\n",
"\n",
" Parameters to pass to to the fit method of classifier.\n",
"\n",
"**Returns**\n",
"\n",
"- `self` : object\n",
"\n",
"\n",
"<hr>\n",
"\n",
"*fit_transform(X, y)*\n",
"*fit_transform(X, y, **fit_params)*\n",
"\n",
"Fit to training data then reduce X to its most important features.\n",
"\n",
@@ -1588,6 +1592,10 @@
" Training vectors, where n_samples is the number of samples and\n",
" n_features is the number of features.\n",
"\n",
"- `fit_params` : dict of string -> object, optional\n",
"\n",
" Parameters to pass to to the fit method of classifier.\n",
"\n",
"**Returns**\n",
"\n",
"Reduced feature subset of X, shape={n_samples, k_features}\n",
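One subtlety in the docstrings above: `fit_params` is described as a dict of string -> object, but `fit` and `fit_transform` accept the parameters as plain keyword arguments and only bundle them into a mapping internally. A short sketch of the `fit_transform` call pattern, assuming an estimator whose `fit` accepts `sample_weight`:

```python
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

iris = load_iris()
w = np.ones(iris.data.shape[0])

sfs = SFS(LogisticRegression(), k_features=2)
# Keyword-argument form at the call site; the selector forwards these
# internally as the mapping {'sample_weight': w}.
X_reduced = sfs.fit_transform(iris.data, iris.target, sample_weight=w)
print(X_reduced.shape)  # (150, 2)
```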
39 changes: 24 additions & 15 deletions mlxtend/feature_selection/sequential_feature_selector.py
@@ -22,16 +22,17 @@
from sklearn.externals.joblib import Parallel, delayed


def _calc_score(selector, X, y, indices):
def _calc_score(selector, X, y, indices, **fit_params):
if selector.cv:
scores = cross_val_score(selector.est_,
X[:, indices], y,
cv=selector.cv,
scoring=selector.scorer,
n_jobs=1,
pre_dispatch=selector.pre_dispatch)
pre_dispatch=selector.pre_dispatch,
fit_params=fit_params)
else:
selector.est_.fit(X[:, indices], y)
selector.est_.fit(X[:, indices], y, **fit_params)
scores = np.array([selector.scorer(selector.est_, X[:, indices], y)])
return indices, scores

@@ -169,7 +170,7 @@ def __init__(self, estimator, k_features=1,
# don't mess with this unless testing
self._TESTING_INTERRUPT_MODE = False

def fit(self, X, y):
def fit(self, X, y, **fit_params):
"""Perform feature selection and learn model from training data.

Parameters
@@ -179,6 +180,8 @@ def fit(self, X, y):
n_features is the number of features.
y : array-like, shape = [n_samples]
Target values.
fit_params : dict of string -> object, optional
Parameters to pass to the fit method of the classifier.

Returns
-------
@@ -248,7 +251,7 @@ def fit(self, X, y):
k_to_select = min_k
k_idx = tuple(range(X.shape[1]))
k = len(k_idx)
k_idx, k_score = _calc_score(self, X, y, k_idx)
k_idx, k_score = _calc_score(self, X, y, k_idx, **fit_params)
self.subsets_[k] = {
'feature_idx': k_idx,
'cv_scores': k_score,
@@ -266,14 +269,16 @@
orig_set=orig_set,
subset=prev_subset,
X=X,
y=y
y=y,
**fit_params
)
else:

k_idx, k_score, cv_scores = self._exclusion(
feature_set=prev_subset,
X=X,
y=y
y=y,
**fit_params
)

if self.floating:
@@ -298,15 +303,17 @@
feature_set=k_idx,
fixed_feature=new_feature,
X=X,
y=y
y=y,
**fit_params
)

else:
k_idx_c, k_score_c, cv_scores_c = self._inclusion(
orig_set=orig_set - {new_feature},
subset=set(k_idx),
X=X,
y=y
y=y,
**fit_params
)

if k_score_c is not None and k_score_c > k_score:
@@ -395,7 +402,7 @@ def fit(self, X, y):
self.fitted = True
return self

def _inclusion(self, orig_set, subset, X, y, ignore_feature=None):
def _inclusion(self, orig_set, subset, X, y, ignore_feature=None, **fit_params):
all_avg_scores = []
all_cv_scores = []
all_subsets = []
@@ -407,7 +414,7 @@ def _inclusion(self, orig_set, subset, X, y, ignore_feature=None):
parallel = Parallel(n_jobs=n_jobs, verbose=self.verbose,
pre_dispatch=self.pre_dispatch)
work = parallel(delayed(_calc_score)
(self, X, y, tuple(subset | {feature}))
(self, X, y, tuple(subset | {feature}), **fit_params)
for feature in remaining
if feature != ignore_feature)

@@ -422,7 +429,7 @@
all_cv_scores[best])
return res

def _exclusion(self, feature_set, X, y, fixed_feature=None):
def _exclusion(self, feature_set, X, y, fixed_feature=None, **fit_params):
n = len(feature_set)
res = (None, None, None)
if n > 1:
@@ -433,7 +440,7 @@ def _exclusion(self, feature_set, X, y, fixed_feature=None):
n_jobs = min(self.n_jobs, features)
parallel = Parallel(n_jobs=n_jobs, verbose=self.verbose,
pre_dispatch=self.pre_dispatch)
work = parallel(delayed(_calc_score)(self, X, y, p)
work = parallel(delayed(_calc_score)(self, X, y, p, **fit_params)
for p in combinations(feature_set, r=n - 1)
if not fixed_feature or fixed_feature in set(p))

@@ -466,21 +473,23 @@ def transform(self, X):
self._check_fitted()
return X[:, self.k_feature_idx_]

def fit_transform(self, X, y):
def fit_transform(self, X, y, **fit_params):
"""Fit to training data then reduce X to its most important features.

Parameters
----------
X : {array-like, sparse matrix}, shape = [n_samples, n_features]
Training vectors, where n_samples is the number of samples and
n_features is the number of features.
fit_params : dict of string -> object, optional
Parameters to pass to the fit method of the classifier.

Returns
-------
Reduced feature subset of X, shape={n_samples, k_features}

"""
self.fit(X, y)
self.fit(X, y, **fit_params)
return self.transform(X)

def get_metric_dict(self, confidence_interval=0.95):
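To make the cv branch of `_calc_score` concrete: when cross-validation is enabled, the collected keyword arguments reach scikit-learn as the `fit_params` dict of `cross_val_score`, which forwards them to each fold's `fit` call (array-valued entries of length `n_samples` are indexed per fold). A standalone sketch of that underlying mechanism; the feature subset `(0, 2)` and the estimator are illustrative only:

```python
import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

iris = load_iris()
w = np.ones(iris.data.shape[0])

# Roughly what _calc_score runs for one candidate feature subset when
# selector.cv is set; **fit_params arrives there as {'sample_weight': w}.
scores = cross_val_score(
    RandomForestClassifier(n_estimators=10, random_state=0),
    iris.data[:, (0, 2)], iris.target,
    cv=5,
    fit_params={'sample_weight': w})
print(scores.mean())
```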
@@ -61,6 +61,18 @@ def test_run_default():
assert sfs.k_feature_idx_ == (3,)


def test_fit_params():
iris = load_iris()
X = iris.data
y = iris.target
sample_weight = np.ones(X.shape[0])
forest = RandomForestClassifier(n_estimators=100, random_state=123)
sfs = SFS(estimator=forest,
verbose=0)
sfs.fit(X, y, sample_weight=sample_weight)
assert sfs.k_feature_idx_ == (3,)


def test_kfeatures_type_1():
iris = load_iris()
X = iris.data
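The new `test_fit_params` above exercises the default `cv=5` path. A hypothetical companion check, not part of this PR, for the no-cv branch, where the parameters go straight to the estimator's `fit`:

```python
def test_fit_params_no_cv():
    # Hypothetical companion test (not in this PR): with cv=0 the
    # selector skips cross_val_score, so fit_params must be forwarded
    # directly to the estimator's fit call.
    iris = load_iris()
    X = iris.data
    y = iris.target
    sample_weight = np.ones(X.shape[0])
    forest = RandomForestClassifier(n_estimators=100, random_state=123)
    sfs = SFS(estimator=forest, cv=0, verbose=0)
    sfs.fit(X, y, sample_weight=sample_weight)
    assert len(sfs.k_feature_idx_) == 1
```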