Skip to content

Commit

Permalink
[data/preprocessors] simple imputer raise error on no stats for column (
Browse files Browse the repository at this point in the history
#49425)

<!-- Thank you for your contribution! Please review
https://github.com/ray-project/ray/blob/master/CONTRIBUTING.rst before
opening a pull request. -->

<!-- Please add a reviewer to the assignee section when you create a PR.
If you don't have the access to it, we will shortly find a reviewer and
assign them to your PR. -->

## Why are these changes needed?

This PR fixes two errors:
1. when the strategy is constant, we try to check if the column is
categorical, but if the column does not exists in the dataframe, it
fails with a `KeyError`
2. If the preprocessor was unable to compute statistics because all the
columns did not have any value, the preprocessor fails with an
unintuitive error in the dropna.

## Related issue number

<!-- For example: "Closes #1234" -->

## Checks

- [x] I've signed off every commit(by using the -s flag, i.e., `git
commit -s`) in this PR.
- [x] I've run `scripts/format.sh` to lint the changes in this PR.
- [ ] I've included any doc changes needed for
https://docs.ray.io/en/master/.
- [ ] I've added any new APIs to the API Reference. For example, if I
added a
method in Tune, I've added it in `doc/source/tune/api/` under the
           corresponding `.rst` file.
- [ ] I've made sure the tests are passing. Note that there might be a
few flaky tests, see the recent failures at https://flakey-tests.ray.io/
- Testing Strategy
   - [ ] Unit tests
   - [ ] Release tests
   - [ ] This PR is not tested :(

Signed-off-by: Martin Bomio <martinbomio@spotify.com>
  • Loading branch information
martinbomio authored Jan 6, 2025
1 parent 951fc14 commit f4c4c81
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 19 deletions.
45 changes: 26 additions & 19 deletions python/ray/data/preprocessors/imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,31 +117,38 @@ def _fit(self, dataset: Dataset) -> Preprocessor:
return self

def _transform_pandas(self, df: pd.DataFrame):
if self.strategy == "mean":
new_values = {
column: self.stats_[f"mean({column})"] for column in self.columns
}
elif self.strategy == "most_frequent":
new_values = {
column: self.stats_[f"most_frequent({column})"]
for column in self.columns
}
elif self.strategy == "constant":
new_values = {column: self.fill_value for column in self.columns}
for column, value in new_values.items():
if is_categorical_dtype(df.dtypes[column]):
df[column] = df[column].cat.add_categories(value)
for column in self.columns:
value = self._get_fill_value(column)

if value is None:
raise ValueError(
f"Column {column} has no fill value. "
"Check the data used to fit the SimpleImputer."
)

for column_name in new_values:
if column_name not in df.columns:
if column not in df.columns:
# Create the column with the fill_value if it doesn't exist
df[column_name] = new_values[column_name]
df[column] = value
else:
# Fill NaN (empty) values in the existing column with the fill_value
df[column_name].fillna(new_values[column_name], inplace=True)
if is_categorical_dtype(df.dtypes[column]):
df[column] = df[column].cat.add_categories([value])
df[column].fillna(value, inplace=True)

return df

def _get_fill_value(self, column):
if self.strategy == "mean":
return self.stats_[f"mean({column})"]
elif self.strategy == "most_frequent":
return self.stats_[f"most_frequent({column})"]
elif self.strategy == "constant":
return self.fill_value
else:
raise ValueError(
f"Strategy {self.strategy} is not supported. "
"Supported values are: {self._valid_strategies}"
)

def __repr__(self):
return (
f"{self.__class__.__name__}(columns={self.columns!r}, "
Expand Down
46 changes: 46 additions & 0 deletions python/ray/data/tests/preprocessors/test_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,52 @@ def test_simple_imputer():
assert constant_out_df.equals(constant_expected_df)


def test_imputer_all_nan_raise_error():
data = {
"A": [np.nan, np.nan, np.nan, np.nan],
}
df = pd.DataFrame(data)
dataset = ray.data.from_pandas(df)

imputer = SimpleImputer(columns=["A"], strategy="mean")
imputer.fit(dataset)

with pytest.raises(ValueError):
imputer.transform_batch(df)


def test_imputer_constant_categorical():
data = {
"A_cat": ["one", "two", None, "four"],
}
df = pd.DataFrame(data)
df["A_cat"] = df["A_cat"].astype("category")
dataset = ray.data.from_pandas(df)

imputer = SimpleImputer(columns=["A_cat"], strategy="constant", fill_value="three")
imputer.fit(dataset)

transformed_df = imputer.transform_batch(df)

expected = {
"A_cat": ["one", "two", "three", "four"],
}

for column in data.keys():
np.testing.assert_array_equal(transformed_df[column].values, expected[column])

df = pd.DataFrame({"A": [1, 2, 3, 4]})
transformed_df = imputer.transform_batch(df)

expected = {
"A": [1, 2, 3, 4],
"A_cat": ["three", "three", "three", "three"],
}

for column in df:
np.testing.assert_array_equal(transformed_df[column].values, expected[column])


if __name__ == "__main__":
import sys

Expand Down

0 comments on commit f4c4c81

Please sign in to comment.