[data/preprocessors] simple imputer raise error on no stats for column (

#49425)   ## Why are these changes needed? This PR fixes two errors: 1. when the strategy is constant, we try to check if the column is categorical, but if the column does not exists in the dataframe, it fails with a `KeyError` 2. If the preprocessor was unable to compute statistics because all the columns did not have any value, the preprocessor fails with an unintuitive error in the dropna. ## Related issue number  ## Checks - [x] I've signed off every commit(by using the -s flag, i.e., `git commit -s`) in this PR. - [x] I've run `scripts/format.sh` to lint the changes in this PR. - [ ] I've included any doc changes needed for https://docs.ray.io/en/master/. - [ ] I've added any new APIs to the API Reference. For example, if I added a method in Tune, I've added it in `doc/source/tune/api/` under the corresponding `.rst` file. - [ ] I've made sure the tests are passing. Note that there might be a few flaky tests, see the recent failures at https://flakey-tests.ray.io/ - Testing Strategy - [ ] Unit tests - [ ] Release tests - [ ] This PR is not tested :( Signed-off-by: Martin Bomio <martinbomio@spotify.com>
ray-project · Jan 6, 2025 · f4c4c81 · f4c4c81
1 parent 951fc14
commit f4c4c81
Show file tree

Hide file tree

Showing 2 changed files with 72 additions and 19 deletions.
diff --git a/python/ray/data/preprocessors/imputer.py b/python/ray/data/preprocessors/imputer.py
@@ -117,31 +117,38 @@ def _fit(self, dataset: Dataset) -> Preprocessor:
         return self
 
     def _transform_pandas(self, df: pd.DataFrame):
-        if self.strategy == "mean":
-            new_values = {
-                column: self.stats_[f"mean({column})"] for column in self.columns
-            }
-        elif self.strategy == "most_frequent":
-            new_values = {
-                column: self.stats_[f"most_frequent({column})"]
-                for column in self.columns
-            }
-        elif self.strategy == "constant":
-            new_values = {column: self.fill_value for column in self.columns}
-            for column, value in new_values.items():
-                if is_categorical_dtype(df.dtypes[column]):
-                    df[column] = df[column].cat.add_categories(value)
+        for column in self.columns:
+            value = self._get_fill_value(column)
+
+            if value is None:
+                raise ValueError(
+                    f"Column {column} has no fill value. "
+                    "Check the data used to fit the SimpleImputer."
+                )
 
-        for column_name in new_values:
-            if column_name not in df.columns:
+            if column not in df.columns:
                 # Create the column with the fill_value if it doesn't exist
-                df[column_name] = new_values[column_name]
+                df[column] = value
             else:
-                # Fill NaN (empty) values in the existing column with the fill_value
-                df[column_name].fillna(new_values[column_name], inplace=True)
+                if is_categorical_dtype(df.dtypes[column]):
+                    df[column] = df[column].cat.add_categories([value])
+                df[column].fillna(value, inplace=True)
 
         return df
 
+    def _get_fill_value(self, column):
+        if self.strategy == "mean":
+            return self.stats_[f"mean({column})"]
+        elif self.strategy == "most_frequent":
+            return self.stats_[f"most_frequent({column})"]
+        elif self.strategy == "constant":
+            return self.fill_value
+        else:
+            raise ValueError(
+                f"Strategy {self.strategy} is not supported. "
+                "Supported values are: {self._valid_strategies}"
+            )
+
     def __repr__(self):
         return (
             f"{self.__class__.__name__}(columns={self.columns!r}, "

diff --git a/python/ray/data/tests/preprocessors/test_imputer.py b/python/ray/data/tests/preprocessors/test_imputer.py
@@ -126,6 +126,52 @@ def test_simple_imputer():
     assert constant_out_df.equals(constant_expected_df)
 
 
+def test_imputer_all_nan_raise_error():
+    data = {
+        "A": [np.nan, np.nan, np.nan, np.nan],
+    }
+    df = pd.DataFrame(data)
+    dataset = ray.data.from_pandas(df)
+
+    imputer = SimpleImputer(columns=["A"], strategy="mean")
+    imputer.fit(dataset)
+
+    with pytest.raises(ValueError):
+        imputer.transform_batch(df)
+
+
+def test_imputer_constant_categorical():
+    data = {
+        "A_cat": ["one", "two", None, "four"],
+    }
+    df = pd.DataFrame(data)
+    df["A_cat"] = df["A_cat"].astype("category")
+    dataset = ray.data.from_pandas(df)
+
+    imputer = SimpleImputer(columns=["A_cat"], strategy="constant", fill_value="three")
+    imputer.fit(dataset)
+
+    transformed_df = imputer.transform_batch(df)
+
+    expected = {
+        "A_cat": ["one", "two", "three", "four"],
+    }
+
+    for column in data.keys():
+        np.testing.assert_array_equal(transformed_df[column].values, expected[column])
+
+    df = pd.DataFrame({"A": [1, 2, 3, 4]})
+    transformed_df = imputer.transform_batch(df)
+
+    expected = {
+        "A": [1, 2, 3, 4],
+        "A_cat": ["three", "three", "three", "three"],
+    }
+
+    for column in df:
+        np.testing.assert_array_equal(transformed_df[column].values, expected[column])
+
+
 if __name__ == "__main__":
     import sys