Skip to content

Commit

Permalink
[AIR] Maintain dtype info in LightGBMPredictor (#28673)
Browse files Browse the repository at this point in the history
We always convert to numpy and then back to dataframe in `LightGBMPredictor`, and try to infer dtypes in between. This is imprecise and allows for an edge case where a Categorical column composed of integers is classified as an int column, and it also decreases performance. This PR keeps dtype information if possible by not converting to numpy unnecessarily. The inference logic is still present for the tensor column case - I am not familiar enough with it to fix it here (if it needs fixing in the first place).

Signed-off-by: Antoni Baum <antoni.baum@protonmail.com>
  • Loading branch information
Yard1 authored Sep 22, 2022
1 parent f6ae7ee commit b7f0346
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 27 deletions.
44 changes: 18 additions & 26 deletions python/ray/train/lightgbm/lightgbm_predictor.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import numpy as np
from typing import TYPE_CHECKING, List, Optional, Union

import lightgbm
import pandas as pd
from pandas.api.types import is_object_dtype

from ray.air.checkpoint import Checkpoint
from ray.air.constants import TENSOR_COLUMN_NAME
Expand Down Expand Up @@ -129,33 +129,25 @@ def _predict_pandas(
if feature_columns:
# In this case feature_columns is a list of integers
data = data[:, feature_columns]
# Turn into dataframe to make dtype resolution easy
data = pd.DataFrame(data, columns=feature_names)
data = data.infer_objects()

# Pandas does not detect categorical dtypes. Any remaining object
# dtypes are probably categories, so convert them.
# This will fail if we have a category composed entirely of
# integers, but this is the best we can do here.
update_dtypes = {}
for column in data.columns:
dtype = data.dtypes[column]
if is_object_dtype(dtype):
update_dtypes[column] = pd.CategoricalDtype()

if update_dtypes:
data = data.astype(update_dtypes, copy=False)
elif feature_columns:
# feature_columns is a list of integers or strings
data = data[feature_columns].to_numpy()
# Only set the feature names if they are strings
if all(isinstance(fc, str) for fc in feature_columns):
feature_names = feature_columns
else:
feature_columns = data.columns.tolist()
data = data.to_numpy()

if all(isinstance(fc, str) for fc in feature_columns):
feature_names = feature_columns

# Turn into dataframe to make dtype resolution easy
data = pd.DataFrame(data, columns=feature_names)
data = data.infer_objects()

# Pandas does not detect categorical dtypes. Any remaining object
# dtypes are probably categories, so convert them.
update_dtypes = {}
for column in data.columns:
dtype = data.dtypes[column]
if dtype == np.object:
update_dtypes[column] = pd.CategoricalDtype()

if update_dtypes:
data = data.astype(update_dtypes, copy=False)
data = data[feature_columns]

df = pd.DataFrame(self.model.predict(data, **predict_kwargs))
df.columns = (
Expand Down
24 changes: 24 additions & 0 deletions python/ray/train/tests/test_lightgbm_predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,30 @@ def test_predict_feature_columns_pandas():
assert predictor.get_preprocessor().has_preprocessed


@pytest.mark.parametrize("to_string", [True, False])
def test_predict_feature_columns_pandas_categorical(to_string: bool):
    """Predict on a DataFrame whose "A" feature column is categorical.

    With ``to_string=True`` the values of "A" are converted to strings
    before the column is cast to a category dtype; with ``to_string=False``
    the category is built from the unconverted values.
    """

    def categorize_a(frame):
        # Optionally stringify the values, then cast column "A" to category.
        if to_string:
            frame["A"] = list(map(str, frame["A"]))
        frame["A"] = frame["A"].astype("category")
        return frame

    # Train a small classifier on data that has the categorical column.
    train_df = categorize_a(pd.DataFrame(dummy_data, columns=["A", "B"]))
    train_target = pd.Series(dummy_target)
    classifier = lgbm.LGBMClassifier(n_estimators=10)
    booster = classifier.fit(train_df, train_target).booster_

    predictor = LightGBMPredictor(model=booster, preprocessor=DummyPreprocessor())

    # The batch carries an extra column "C" that is not selected as a feature.
    batch_values = np.array([[1, 2, 2], [3, 4, 8], [5, 6, 9]])
    batch_df = categorize_a(pd.DataFrame(batch_values, columns=["A", "B", "C"]))
    predictions = predictor.predict(batch_df, feature_columns=["A", "B"])

    assert len(predictions) == 3
    assert predictor.get_preprocessor().has_preprocessed


def test_predict_no_preprocessor_no_training():
checkpoint = LightGBMCheckpoint.from_model(booster=model)
predictor = LightGBMPredictor.from_checkpoint(checkpoint)
Expand Down
2 changes: 1 addition & 1 deletion python/ray/train/xgboost/xgboost_predictor.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union

import pandas as pd
from ray.train.xgboost.xgboost_checkpoint import XGBoostCheckpoint
import xgboost

from ray.air.checkpoint import Checkpoint
from ray.air.constants import TENSOR_COLUMN_NAME
from ray.air.data_batch_type import DataBatchType
from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed
from ray.train.predictor import Predictor
from ray.train.xgboost.xgboost_checkpoint import XGBoostCheckpoint
from ray.util.annotations import PublicAPI

if TYPE_CHECKING:
Expand Down

0 comments on commit b7f0346

Please sign in to comment.