Avoid ValueError in parallel computing of large arrays
This PR introduces the optional *max_nbytes* parameter on the
*OneVsRestClassifier*, *OneVsOneClassifier* and *OutputCodeClassifier*
multiclass learning algorithms within *multiclass.py*.

This parameter complements the existing *n_jobs* one and is useful when
a large training set is processed by concurrently running jobs, i.e.
when *n_jobs* > 1 or *n_jobs* = -1 (meaning that the number of jobs is
set to the number of CPU cores). In this case,
[Parallel](https://joblib.readthedocs.io/en/latest/parallel.html#parallel-reference-documentation)
is called with the default "loky" backend, which [implements
multi-processing](https://joblib.readthedocs.io/en/latest/parallel.html#thread-based-parallelism-vs-process-based-parallelism);
*Parallel* also applies a default 1-megabyte
[threshold](https://joblib.readthedocs.io/en/latest/parallel.html#automated-array-to-memmap-conversion)
to the size of arrays passed to the workers: larger arrays are dumped to
a temporary folder and memory-mapped as read-only. This default may not
be adequate for large arrays and can break the job with the exception
**ValueError: UPDATEIFCOPY base is read-only**. *Parallel* accepts
*max_nbytes* to control this threshold; through this fix, the multiclass
classifiers optionally expose it, so users can customize the maximum
size of arrays kept in memory.
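
For illustration only (not part of this commit), here is a minimal
sketch of the joblib behaviour that the new parameter forwards; the
array shape and the '50M' value are arbitrary:

```python
import numpy as np
from joblib import Parallel, delayed

X = np.random.rand(5000, 200)  # ~8 MB, above joblib's default 1M threshold

def column_sum(a, i):
    # When `a` exceeds max_nbytes, workers receive it as a read-only memmap.
    return a[:, i].sum()

# Default max_nbytes='1M': X is dumped to a temp folder and memory-mapped.
sums = Parallel(n_jobs=2)(delayed(column_sum)(X, i) for i in range(5))

# Raising the threshold (here to 50 MB) keeps X as a plain in-memory array;
# max_nbytes=None would disable the array-to-memmap conversion entirely.
sums = Parallel(n_jobs=2, max_nbytes='50M')(
    delayed(column_sum)(X, i) for i in range(5))
```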

Fixes scikit-learn#6614
Expected to also fix scikit-learn#4597
Ircama committed Nov 21, 2019
1 parent 1c546cd commit 311d1ba
Showing 1 changed file with 38 additions and 9 deletions.
47 changes: 38 additions & 9 deletions sklearn/multiclass.py
@@ -163,6 +163,13 @@ class OneVsRestClassifier(MultiOutputMixin, ClassifierMixin,
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
max_nbytes : int, str, or None, optional, 1M by default
Threshold on the size of arrays passed to the workers that triggers
automated memory mapping in temp_folder. Can be an int in bytes, or
a human-readable string, e.g., '1M' for 1 megabyte. Use None to disable
memmapping of large arrays. Only active when backend="loky" or
"multiprocessing".
Attributes
----------
estimators_ : list of `n_classes` estimators
@@ -200,9 +207,10 @@ class OneVsRestClassifier(MultiOutputMixin, ClassifierMixin,
array([2, 0, 1])
"""
def __init__(self, estimator, n_jobs=None):
def __init__(self, estimator, n_jobs=None, max_nbytes='1M'):
self.estimator = estimator
self.n_jobs = n_jobs
self.max_nbytes = max_nbytes

def fit(self, X, y):
"""Fit underlying estimators.
@@ -232,7 +240,9 @@ def fit(self, X, y):
# In cases where individual estimators are very fast to train setting
# n_jobs > 1 can result in slower performance due to the overhead
# of spawning threads. See joblib issue #112.
self.estimators_ = Parallel(n_jobs=self.n_jobs)(delayed(_fit_binary)(
self.estimators_ = Parallel(n_jobs=self.n_jobs,
max_nbytes=self.max_nbytes
)(delayed(_fit_binary)(
self.estimator, X, column, classes=[
"not %s" % self.label_binarizer_.classes_[i],
self.label_binarizer_.classes_[i]])
@@ -290,7 +300,8 @@ def partial_fit(self, X, y, classes=None):
Y = Y.tocsc()
columns = (col.toarray().ravel() for col in Y.T)

self.estimators_ = Parallel(n_jobs=self.n_jobs)(
self.estimators_ = Parallel(n_jobs=self.n_jobs,
max_nbytes=self.max_nbytes)(
delayed(_partial_fit_binary)(estimator, X, column)
for estimator, column in zip(self.estimators_, columns))

@@ -486,6 +497,13 @@ class OneVsOneClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
max_nbytes : int, str, or None, optional, 1M by default
Threshold on the size of arrays passed to the workers that triggers
automated memory mapping in temp_folder. Can be an int in bytes, or
a human-readable string, e.g., '1M' for 1 megabyte. Use None to disable
memmapping of large arrays. Only active when backend="loky" or
"multiprocessing".
Attributes
----------
estimators_ : list of ``n_classes * (n_classes - 1) / 2`` estimators
@@ -502,9 +520,10 @@ class OneVsOneClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
``None`` when ``estimator`` does not have ``_pairwise`` attribute.
"""

def __init__(self, estimator, n_jobs=None):
def __init__(self, estimator, n_jobs=None, max_nbytes='1M'):
self.estimator = estimator
self.n_jobs = n_jobs
self.max_nbytes = max_nbytes

def fit(self, X, y):
"""Fit underlying estimators.
@@ -529,7 +548,8 @@ def fit(self, X, y):
raise ValueError("OneVsOneClassifier can not be fit when only one"
" class is present.")
n_classes = self.classes_.shape[0]
estimators_indices = list(zip(*(Parallel(n_jobs=self.n_jobs)(
estimators_indices = list(zip(*(Parallel(n_jobs=self.n_jobs,
max_nbytes=self.max_nbytes)(
delayed(_fit_ovo_binary)
(self.estimator, X, y, self.classes_[i], self.classes_[j])
for i in range(n_classes) for j in range(i + 1, n_classes)))))
@@ -581,8 +601,8 @@ def partial_fit(self, X, y, classes=None):
X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
check_classification_targets(y)
combinations = itertools.combinations(range(self.n_classes_), 2)
self.estimators_ = Parallel(
n_jobs=self.n_jobs)(
self.estimators_ = Parallel(n_jobs=self.n_jobs,
max_nbytes=self.max_nbytes)(
delayed(_partial_fit_ovo_binary)(
estimator, X, y, self.classes_[i], self.classes_[j])
for estimator, (i, j) in zip(self.estimators_,
@@ -696,6 +716,13 @@ class OutputCodeClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
max_nbytes : int, str, or None, optional, 1M by default
Threshold on the size of arrays passed to the workers that triggers
automated memory mapping in temp_folder. Can be an int in bytes, or
a human-readable string, e.g., '1M' for 1 megabyte. Use None to disable
memmapping of large arrays. Only active when backend="loky" or
"multiprocessing".
Attributes
----------
estimators_ : list of `int(n_classes * code_size)` estimators
@@ -741,11 +768,12 @@ class OutputCodeClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
"""

def __init__(self, estimator, code_size=1.5, random_state=None,
n_jobs=None):
n_jobs=None, max_nbytes='1M'):
self.estimator = estimator
self.code_size = code_size
self.random_state = random_state
self.n_jobs = n_jobs
self.max_nbytes = max_nbytes

def fit(self, X, y):
"""Fit underlying estimators.
@@ -790,7 +818,8 @@ def fit(self, X, y):
Y = np.array([self.code_book_[classes_index[y[i]]]
for i in range(X.shape[0])], dtype=np.int)

self.estimators_ = Parallel(n_jobs=self.n_jobs)(
self.estimators_ = Parallel(n_jobs=self.n_jobs,
max_nbytes=self.max_nbytes)(
delayed(_fit_binary)(self.estimator, X, Y[:, i])
for i in range(Y.shape[1]))

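With this change applied, callers can raise or disable the memmapping
threshold when fitting on large data. A hedged usage sketch, assuming
this branch is installed (the LinearSVC estimator and the '100M' value
are arbitrary choices):

```python
from sklearn.datasets import make_classification
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=20000, n_features=100,
                           n_classes=4, n_informative=10)

# Raise the threshold to 100 MB so X reaches the workers as a regular
# writable in-memory array instead of a read-only memmap.
clf = OneVsRestClassifier(LinearSVC(), n_jobs=-1, max_nbytes='100M')
clf.fit(X, y)
```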
