[AIR] Maintain dtype info in LightGBMPredictor (#28673)

Previously, `LightGBMPredictor` always converted its input to numpy and then back to a dataframe, re-inferring the dtypes afterwards. This is imprecise: it allows for an edge case where a Categorical column composed of integers is classified as an int column. It also decreases performance. This PR keeps dtype information where possible by not converting to numpy unnecessarily. The inference logic is still present for the tensor column case - I am not familiar enough with it to fix it here (if it needs fixing in the first place).

Signed-off-by: Antoni Baum <antoni.baum@protonmail.com>
ray-project · Sep 22, 2022 · b7f0346 · b7f0346
1 parent f6ae7ee
commit b7f0346
Show file tree

Hide file tree

Showing 3 changed files with 43 additions and 27 deletions.
diff --git a/python/ray/train/lightgbm/lightgbm_predictor.py b/python/ray/train/lightgbm/lightgbm_predictor.py
@@ -1,8 +1,8 @@
-import numpy as np
 from typing import TYPE_CHECKING, List, Optional, Union
 
 import lightgbm
 import pandas as pd
+from pandas.api.types import is_object_dtype
 
 from ray.air.checkpoint import Checkpoint
 from ray.air.constants import TENSOR_COLUMN_NAME
@@ -129,33 +129,25 @@ def _predict_pandas(
             if feature_columns:
                 # In this case feature_columns is a list of integers
                 data = data[:, feature_columns]
+            # Turn into dataframe to make dtype resolution easy
+            data = pd.DataFrame(data, columns=feature_names)
+            data = data.infer_objects()
+
+            # Pandas does not detect categorical dtypes. Any remaining object
+            # dtypes are probably categories, so convert them.
+            # This will fail if we have a category composed entirely of
+            # integers, but this is the best we can do here.
+            update_dtypes = {}
+            for column in data.columns:
+                dtype = data.dtypes[column]
+                if is_object_dtype(dtype):
+                    update_dtypes[column] = pd.CategoricalDtype()
+
+            if update_dtypes:
+                data = data.astype(update_dtypes, copy=False)
         elif feature_columns:
             # feature_columns is a list of integers or strings
-            data = data[feature_columns].to_numpy()
-            # Only set the feature names if they are strings
-            if all(isinstance(fc, str) for fc in feature_columns):
-                feature_names = feature_columns
-        else:
-            feature_columns = data.columns.tolist()
-            data = data.to_numpy()
-
-            if all(isinstance(fc, str) for fc in feature_columns):
-                feature_names = feature_columns
-
-        # Turn into dataframe to make dtype resolution easy
-        data = pd.DataFrame(data, columns=feature_names)
-        data = data.infer_objects()
-
-        # Pandas does not detect categorical dtypes. Any remaining object
-        # dtypes are probably categories, so convert them.
-        update_dtypes = {}
-        for column in data.columns:
-            dtype = data.dtypes[column]
-            if dtype == np.object:
-                update_dtypes[column] = pd.CategoricalDtype()
-
-        if update_dtypes:
-            data = data.astype(update_dtypes, copy=False)
+            data = data[feature_columns]
 
         df = pd.DataFrame(self.model.predict(data, **predict_kwargs))
         df.columns = (

diff --git a/python/ray/train/tests/test_lightgbm_predictor.py b/python/ray/train/tests/test_lightgbm_predictor.py
@@ -120,6 +120,30 @@ def test_predict_feature_columns_pandas():
     assert predictor.get_preprocessor().has_preprocessed
 
 
+@pytest.mark.parametrize("to_string", [True, False])
+def test_predict_feature_columns_pandas_categorical(to_string: bool):
+    pandas_data = pd.DataFrame(dummy_data, columns=["A", "B"])
+    if to_string:
+        pandas_data["A"] = [str(x) for x in pandas_data["A"]]
+    pandas_data["A"] = pandas_data["A"].astype("category")
+    pandas_target = pd.Series(dummy_target)
+    pandas_model = (
+        lgbm.LGBMClassifier(n_estimators=10).fit(pandas_data, pandas_target).booster_
+    )
+    preprocessor = DummyPreprocessor()
+    predictor = LightGBMPredictor(model=pandas_model, preprocessor=preprocessor)
+    data_batch = pd.DataFrame(
+        np.array([[1, 2, 2], [3, 4, 8], [5, 6, 9]]), columns=["A", "B", "C"]
+    )
+    if to_string:
+        data_batch["A"] = [str(x) for x in data_batch["A"]]
+    data_batch["A"] = data_batch["A"].astype("category")
+    predictions = predictor.predict(data_batch, feature_columns=["A", "B"])
+
+    assert len(predictions) == 3
+    assert predictor.get_preprocessor().has_preprocessed
+
+
 def test_predict_no_preprocessor_no_training():
     checkpoint = LightGBMCheckpoint.from_model(booster=model)
     predictor = LightGBMPredictor.from_checkpoint(checkpoint)

diff --git a/python/ray/train/xgboost/xgboost_predictor.py b/python/ray/train/xgboost/xgboost_predictor.py
@@ -1,14 +1,14 @@
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
 
 import pandas as pd
-from ray.train.xgboost.xgboost_checkpoint import XGBoostCheckpoint
 import xgboost
 
 from ray.air.checkpoint import Checkpoint
 from ray.air.constants import TENSOR_COLUMN_NAME
 from ray.air.data_batch_type import DataBatchType
 from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed
 from ray.train.predictor import Predictor
+from ray.train.xgboost.xgboost_checkpoint import XGBoostCheckpoint
 from ray.util.annotations import PublicAPI
 
 if TYPE_CHECKING: