Skip to content

Commit

Permalink
Allowing optional list of Parallel keyworded parameters
Browse files Browse the repository at this point in the history
Changing the "OneVsRestClassifier", "OneVsOneClassifier" and
"OutputCodeClassifier" multiclass learning algorithms within
multiclass.py, by replacing the "n_jobs" parameter with a keyworded,
variable-length argument list, in order to allow any "Parallel"
parameter to be passed, as well as to support the "parallel_backend"
context manager.

"n_jobs" remains one of the possible parameters, but other ones can be
added, including "max_nbytes", which can be useful to avoid a
ValueError when a large training set is processed by concurrently
running jobs, i.e. with *n_jobs* > 1 or *n_jobs* = -1.

More specifically, in parallel computing of large arrays with "loky"
backend,
[Parallel](https://joblib.readthedocs.io/en/latest/parallel.html#parallel-reference-documentation)
sets a default 1-megabyte
[threshold](https://joblib.readthedocs.io/en/latest/parallel.html#automated-array-to-memmap-conversion)
on the size of arrays passed to the workers. This default can be too
low for large arrays: arrays above the threshold are memory-mapped
read-only, which can break jobs with the exception
**ValueError: UPDATEIFCOPY base is read-only**.

*Parallel* uses *max_nbytes* to control this threshold.

Through this fix, the multiclass classifiers will offer the optional
possibility to customize the max size of arrays.

Fixes scikit-learn#6614
See also scikit-learn#4597
  • Loading branch information
Ircama committed Nov 23, 2019
1 parent 308a54e commit eeb7707
Showing 1 changed file with 33 additions and 27 deletions.
60 changes: 33 additions & 27 deletions sklearn/multiclass.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,11 +157,13 @@ class OneVsRestClassifier(MultiOutputMixin, ClassifierMixin,
An estimator object implementing :term:`fit` and one of
:term:`decision_function` or :term:`predict_proba`.
n_jobs : int or None, optional (default=None)
The number of jobs to use for the computation.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
parallel_params : keyworded, variable-length argument list
Optional list of keyworded parameters to be passed to
:class:`joblib.Parallel`. For instance, ``n_jobs`` (the number of jobs
to use for the computation, see :term:`Glossary <n_jobs>`),
``max_nbytes``, ``backend``, etc. See also the :class:`joblib.Parallel`
reference documentation
(https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html).
Attributes
----------
Expand Down Expand Up @@ -200,9 +202,9 @@ class OneVsRestClassifier(MultiOutputMixin, ClassifierMixin,
array([2, 0, 1])
"""
def __init__(self, estimator, n_jobs=None):
def __init__(self, estimator, **parallel_params):
self.estimator = estimator
self.n_jobs = n_jobs
self.parallel_params = parallel_params

def fit(self, X, y):
"""Fit underlying estimators.
Expand Down Expand Up @@ -232,7 +234,8 @@ def fit(self, X, y):
# In cases where individual estimators are very fast to train, setting
# n_jobs > 1 can result in slower performance due to the overhead
# of spawning threads. See joblib issue #112.
self.estimators_ = Parallel(n_jobs=self.n_jobs)(delayed(_fit_binary)(
self.estimators_ = Parallel(**self.parallel_params
)(delayed(_fit_binary)(
self.estimator, X, column, classes=[
"not %s" % self.label_binarizer_.classes_[i],
self.label_binarizer_.classes_[i]])
Expand Down Expand Up @@ -290,7 +293,7 @@ def partial_fit(self, X, y, classes=None):
Y = Y.tocsc()
columns = (col.toarray().ravel() for col in Y.T)

self.estimators_ = Parallel(n_jobs=self.n_jobs)(
self.estimators_ = Parallel(**self.parallel_params)(
delayed(_partial_fit_binary)(estimator, X, column)
for estimator, column in zip(self.estimators_, columns))

Expand Down Expand Up @@ -480,11 +483,13 @@ class OneVsOneClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
An estimator object implementing :term:`fit` and one of
:term:`decision_function` or :term:`predict_proba`.
n_jobs : int or None, optional (default=None)
The number of jobs to use for the computation.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
parallel_params : keyworded, variable-length argument list
Optional list of keyworded parameters to be passed to
:class:`joblib.Parallel`. For instance, ``n_jobs`` (the number of jobs
to use for the computation, see :term:`Glossary <n_jobs>`),
``max_nbytes``, ``backend``, etc. See also the :class:`joblib.Parallel`
reference documentation
(https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html).
Attributes
----------
Expand All @@ -502,9 +507,9 @@ class OneVsOneClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
``None`` when ``estimator`` does not have ``_pairwise`` attribute.
"""

def __init__(self, estimator, n_jobs=None):
def __init__(self, estimator, **parallel_params):
self.estimator = estimator
self.n_jobs = n_jobs
self.parallel_params = parallel_params

def fit(self, X, y):
"""Fit underlying estimators.
Expand All @@ -529,7 +534,7 @@ def fit(self, X, y):
raise ValueError("OneVsOneClassifier can not be fit when only one"
" class is present.")
n_classes = self.classes_.shape[0]
estimators_indices = list(zip(*(Parallel(n_jobs=self.n_jobs)(
estimators_indices = list(zip(*(Parallel(**self.parallel_params)(
delayed(_fit_ovo_binary)
(self.estimator, X, y, self.classes_[i], self.classes_[j])
for i in range(n_classes) for j in range(i + 1, n_classes)))))
Expand Down Expand Up @@ -581,8 +586,7 @@ def partial_fit(self, X, y, classes=None):
X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
check_classification_targets(y)
combinations = itertools.combinations(range(self.n_classes_), 2)
self.estimators_ = Parallel(
n_jobs=self.n_jobs)(
self.estimators_ = Parallel(**self.parallel_params)(
delayed(_partial_fit_ovo_binary)(
estimator, X, y, self.classes_[i], self.classes_[j])
for estimator, (i, j) in zip(self.estimators_,
Expand Down Expand Up @@ -690,11 +694,13 @@ class OutputCodeClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
random_state is the random number generator; If None, the random number
generator is the RandomState instance used by `np.random`.
n_jobs : int or None, optional (default=None)
The number of jobs to use for the computation.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
parallel_params : keyworded, variable-length argument list
Optional list of keyworded parameters to be passed to
:class:`joblib.Parallel`. For instance, ``n_jobs`` (the number of jobs
to use for the computation, see :term:`Glossary <n_jobs>`),
``max_nbytes``, ``backend``, etc. See also the :class:`joblib.Parallel`
reference documentation
(https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html).
Attributes
----------
Expand Down Expand Up @@ -741,11 +747,11 @@ class OutputCodeClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
"""

def __init__(self, estimator, code_size=1.5, random_state=None,
n_jobs=None):
**parallel_params):
self.estimator = estimator
self.code_size = code_size
self.random_state = random_state
self.n_jobs = n_jobs
self.parallel_params = parallel_params

def fit(self, X, y):
"""Fit underlying estimators.
Expand Down Expand Up @@ -790,7 +796,7 @@ def fit(self, X, y):
Y = np.array([self.code_book_[classes_index[y[i]]]
for i in range(X.shape[0])], dtype=np.int)

self.estimators_ = Parallel(n_jobs=self.n_jobs)(
self.estimators_ = Parallel(**self.parallel_params)(
delayed(_fit_binary)(self.estimator, X, Y[:, i])
for i in range(Y.shape[1]))

Expand Down

0 comments on commit eeb7707

Please sign in to comment.