Avoid ValueError in parallel computing of large arrays
This PR introduces the optional *max_nbytes* parameter on the
*OneVsRestClassifier*, *OneVsOneClassifier* and *OutputCodeClassifier*
multiclass learning algorithms within *multiclass.py*.

This parameter complements the existing *n_jobs* one and is useful when
a large training set is processed by concurrently running jobs, i.e.
when *n_jobs* > 1 or *n_jobs* = -1 (meaning that the number of jobs is
set to the number of CPU cores). In this case,
[Parallel](https://joblib.readthedocs.io/en/latest/parallel.html#parallel-reference-documentation)
is called with the default "loky" backend, which [implements
multi-processing](https://joblib.readthedocs.io/en/latest/parallel.html#thread-based-parallelism-vs-process-based-parallelism);
*Parallel* also applies a default 1-megabyte
[threshold](https://joblib.readthedocs.io/en/latest/parallel.html#automated-array-to-memmap-conversion)
to the size of arrays passed to the workers: larger arrays are dumped to
a temporary folder and memory-mapped as read-only. This default may not
be adequate for large arrays and can break the job with the exception
**ValueError: UPDATEIFCOPY base is read-only**. *Parallel* accepts
*max_nbytes* to control this threshold; through this fix, the multiclass
classifiers optionally expose it, so users can customize the maximum
size of arrays kept in memory.
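
For illustration only (not part of this commit), here is a minimal
sketch of the joblib behaviour that the new parameter forwards; the
array shape and the '50M' value are arbitrary:

```python
import numpy as np
from joblib import Parallel, delayed

X = np.random.rand(5000, 200)  # ~8 MB, above joblib's default 1M threshold

def column_sum(a, i):
    # When `a` exceeds max_nbytes, workers receive it as a read-only memmap.
    return a[:, i].sum()

# Default max_nbytes='1M': X is dumped to a temp folder and memory-mapped.
sums = Parallel(n_jobs=2)(delayed(column_sum)(X, i) for i in range(5))

# Raising the threshold (here to 50 MB) keeps X as a plain in-memory array;
# max_nbytes=None would disable the array-to-memmap conversion entirely.
sums = Parallel(n_jobs=2, max_nbytes='50M')(
    delayed(column_sum)(X, i) for i in range(5))
```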

Fixes scikit-learn#6614
Expected to also fix scikit-learn#4597
Ircama committed Nov 21, 2019
1 parent 1c546cd commit 311d1ba
Showing 1 changed file with 38 additions and 9 deletions.
47 changes: 38 additions & 9 deletions sklearn/multiclass.py
@@ -163,6 +163,13 @@ class OneVsRestClassifier(MultiOutputMixin, ClassifierMixin,
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
max_nbytes : int, str, or None, optional, 1M by default
Threshold on the size of arrays passed to the workers that triggers
automated memory mapping in temp_folder. Can be an int in bytes, or
a human-readable string, e.g., '1M' for 1 megabyte. Use None to disable
memmapping of large arrays. Only active when backend="loky" or
"multiprocessing".
Attributes
----------
estimators_ : list of `n_classes` estimators
@@ -200,9 +207,10 @@ class OneVsRestClassifier(MultiOutputMixin, ClassifierMixin,
array([2, 0, 1])
"""
def __init__(self, estimator, n_jobs=None):
def __init__(self, estimator, n_jobs=None, max_nbytes='1M'):
self.estimator = estimator
self.n_jobs = n_jobs
self.max_nbytes = max_nbytes

def fit(self, X, y):
"""Fit underlying estimators.
@@ -232,7 +240,9 @@ def fit(self, X, y):
# In cases where individual estimators are very fast to train setting
# n_jobs > 1 can result in slower performance due to the overhead
# of spawning threads. See joblib issue #112.
self.estimators_ = Parallel(n_jobs=self.n_jobs)(delayed(_fit_binary)(
self.estimators_ = Parallel(n_jobs=self.n_jobs,
max_nbytes=self.max_nbytes
)(delayed(_fit_binary)(
self.estimator, X, column, classes=[
"not %s" % self.label_binarizer_.classes_[i],
self.label_binarizer_.classes_[i]])
@@ -290,7 +300,8 @@ def partial_fit(self, X, y, classes=None):
Y = Y.tocsc()
columns = (col.toarray().ravel() for col in Y.T)

self.estimators_ = Parallel(n_jobs=self.n_jobs)(
self.estimators_ = Parallel(n_jobs=self.n_jobs,
max_nbytes=self.max_nbytes)(
delayed(_partial_fit_binary)(estimator, X, column)
for estimator, column in zip(self.estimators_, columns))

@@ -486,6 +497,13 @@ class OneVsOneClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
max_nbytes : int, str, or None, optional, 1M by default
Threshold on the size of arrays passed to the workers that triggers
automated memory mapping in temp_folder. Can be an int in bytes, or
a human-readable string, e.g., '1M' for 1 megabyte. Use None to disable
memmapping of large arrays. Only active when backend="loky" or
"multiprocessing".
Attributes
----------
estimators_ : list of ``n_classes * (n_classes - 1) / 2`` estimators
@@ -502,9 +520,10 @@ class OneVsOneClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
``None`` when ``estimator`` does not have ``_pairwise`` attribute.
"""

def __init__(self, estimator, n_jobs=None):
def __init__(self, estimator, n_jobs=None, max_nbytes='1M'):
self.estimator = estimator
self.n_jobs = n_jobs
self.max_nbytes = max_nbytes

def fit(self, X, y):
"""Fit underlying estimators.
@@ -529,7 +548,8 @@ def fit(self, X, y):
raise ValueError("OneVsOneClassifier can not be fit when only one"
" class is present.")
n_classes = self.classes_.shape[0]
estimators_indices = list(zip(*(Parallel(n_jobs=self.n_jobs)(
estimators_indices = list(zip(*(Parallel(n_jobs=self.n_jobs,
max_nbytes=self.max_nbytes)(
delayed(_fit_ovo_binary)
(self.estimator, X, y, self.classes_[i], self.classes_[j])
for i in range(n_classes) for j in range(i + 1, n_classes)))))
@@ -581,8 +601,8 @@ def partial_fit(self, X, y, classes=None):
X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
check_classification_targets(y)
combinations = itertools.combinations(range(self.n_classes_), 2)
self.estimators_ = Parallel(
n_jobs=self.n_jobs)(
self.estimators_ = Parallel(n_jobs=self.n_jobs,
max_nbytes=self.max_nbytes)(
delayed(_partial_fit_ovo_binary)(
estimator, X, y, self.classes_[i], self.classes_[j])
for estimator, (i, j) in zip(self.estimators_,
@@ -696,6 +716,13 @@ class OutputCodeClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
max_nbytes : int, str, or None, optional, 1M by default
Threshold on the size of arrays passed to the workers that triggers
automated memory mapping in temp_folder. Can be an int in bytes, or
a human-readable string, e.g., '1M' for 1 megabyte. Use None to disable
memmapping of large arrays. Only active when backend="loky" or
"multiprocessing".
Attributes
----------
estimators_ : list of `int(n_classes * code_size)` estimators
@@ -741,11 +768,12 @@ class OutputCodeClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
"""

def __init__(self, estimator, code_size=1.5, random_state=None,
n_jobs=None):
n_jobs=None, max_nbytes='1M'):
self.estimator = estimator
self.code_size = code_size
self.random_state = random_state
self.n_jobs = n_jobs
self.max_nbytes = max_nbytes

def fit(self, X, y):
"""Fit underlying estimators.
@@ -790,7 +818,8 @@ def fit(self, X, y):
Y = np.array([self.code_book_[classes_index[y[i]]]
for i in range(X.shape[0])], dtype=np.int)

self.estimators_ = Parallel(n_jobs=self.n_jobs)(
self.estimators_ = Parallel(n_jobs=self.n_jobs,
max_nbytes=self.max_nbytes)(
delayed(_fit_binary)(self.estimator, X, Y[:, i])
for i in range(Y.shape[1]))

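With this change applied, callers can raise or disable the memmapping
threshold when fitting on large data. A hedged usage sketch, assuming
this branch is installed (the LinearSVC estimator and the '100M' value
are arbitrary choices):

```python
from sklearn.datasets import make_classification
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=20000, n_features=100,
                           n_classes=4, n_informative=10)

# Raise the threshold to 100 MB so X reaches the workers as a regular
# writable in-memory array instead of a read-only memmap.
clf = OneVsRestClassifier(LinearSVC(), n_jobs=-1, max_nbytes='100M')
clf.fit(X, y)
```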
