Skip to content

Commit

Permalink
Check for invalid cuDF input. (#11248)
Browse files Browse the repository at this point in the history
  • Loading branch information
trivialfis authored Feb 20, 2025
1 parent ce4fc7f commit e5f29dd
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 7 deletions.
14 changes: 14 additions & 0 deletions python-package/xgboost/_data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@ class _ArrayLikeArg(Protocol):
def __array_interface__(self) -> "ArrayInf": ...


class _CudaArrayLikeArg(Protocol):
    """Structural type for any object exposing ``__cuda_array_interface__``.

    Used purely for static typing: an object satisfies this protocol when it
    provides the ``__cuda_array_interface__`` property declared below.
    """

    @property
    def __cuda_array_interface__(self) -> "ArrayInf":
        """Return the interface description of the underlying array."""
        ...


class TransformedDf(Protocol):
"""Protocol class for storing transformed dataframe."""

Expand Down Expand Up @@ -151,3 +156,12 @@ def array_interface(data: np.ndarray) -> bytes:
interface = array_interface_dict(data)
interface_str = bytes(json.dumps(interface), "utf-8")
return interface_str


def check_cudf_meta(data: _CudaArrayLikeArg, field: str) -> None:
    """Make sure there is no missing value in the meta data.

    Parameters
    ----------
    data :
        Object exposing ``__cuda_array_interface__``.
    field :
        Name of the meta info field, used only in the error message.

    Raises
    ------
    ValueError
        If the interface carries a ``mask`` entry that is not ``None``.
    """
    interface = data.__cuda_array_interface__
    # A missing "mask" key and an explicit ``None`` are both acceptable.
    if interface.get("mask") is None:
        return
    raise ValueError(f"Missing value is not allowed for: {field}")
16 changes: 9 additions & 7 deletions python-package/xgboost/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
array_hasobject,
array_interface,
array_interface_dict,
check_cudf_meta,
cuda_array_interface,
make_array_interface,
)
Expand Down Expand Up @@ -1555,14 +1556,15 @@ def _meta_from_cudf_df(data: DataType, field: str, handle: ctypes.c_void_p) -> N


def _meta_from_cudf_series(data: DataType, field: str, handle: ctypes.c_void_p) -> None:
    """Set a meta info field of a DMatrix from a cuDF series.

    Parameters
    ----------
    data :
        The cuDF series holding the meta data.
    field :
        Name of the meta info field to set.
    handle :
        Handle of the DMatrix that receives the field.

    Raises
    ------
    ValueError
        Raised by ``check_cudf_meta`` when the series carries a validity mask.
    """
    # Validate before handing anything to the native library, so that data
    # with missing values is rejected rather than set on the DMatrix.
    check_cudf_meta(data, field)
    # NOTE(review): ``cuda_array_interface`` is presumed to serialise the
    # ``__cuda_array_interface__`` of ``data`` to bytes -- confirm in
    # ``_data_utils``.
    inf = cuda_array_interface(data)
    _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), inf))


def _meta_from_cupy_array(data: DataType, field: str, handle: ctypes.c_void_p) -> None:
    """Set a meta info field of a DMatrix from a cupy-like array.

    Parameters
    ----------
    data :
        The array holding the meta data; it is passed through
        ``_transform_cupy_array`` first.
    field :
        Name of the meta info field to set.
    handle :
        Handle of the DMatrix that receives the field.
    """
    data = _transform_cupy_array(data)
    # NOTE(review): ``cuda_array_interface`` is presumed to serialise the
    # ``__cuda_array_interface__`` of ``data`` to bytes -- confirm in
    # ``_data_utils``.
    inf = cuda_array_interface(data)
    # Set the field exactly once.
    _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), inf))


def dispatch_meta_backend(
Expand Down Expand Up @@ -1604,15 +1606,15 @@ def dispatch_meta_backend(
data = _transform_dlpack(data)
_meta_from_cupy_array(data, name, handle)
return
if _is_cupy_alike(data):
_meta_from_cupy_array(data, name, handle)
return
if _is_cudf_ser(data):
_meta_from_cudf_series(data, name, handle)
return
if _is_cudf_df(data):
_meta_from_cudf_df(data, name, handle)
return
if _is_cupy_alike(data):
_meta_from_cupy_array(data, name, handle)
return
if _is_modin_df(data):
_meta_from_pandas_df(data, name, dtype=dtype, handle=handle)
return
Expand Down
17 changes: 17 additions & 0 deletions tests/python-gpu/test_from_cudf.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,3 +382,20 @@ def test_from_cudf_iter(enable_categorical):
predict = reg.predict(m)
predict_with_it = reg_with_it.predict(m_it)
np.testing.assert_allclose(predict_with_it, predict)


def test_invalid_meta() -> None:
    """Meta data with missing values must be rejected by both DMatrix types."""
    frame = cudf.DataFrame({"f0": [0, 1, 2], "f1": [2, 3, 4], "y": [None, 1, 2]})
    X = frame.drop(["y"], axis=1)
    constructors = (xgb.DMatrix, xgb.QuantileDMatrix)

    # Label given as a series that contains a null entry.
    null_series = frame["y"]
    for make_dmatrix in constructors:
        with pytest.raises(ValueError, match="Missing value"):
            make_dmatrix(X, null_series)

    # Label given as a dataframe that contains a null entry.
    null_frame = X.copy()
    null_frame.iloc[0, 0] = None
    # check by the cuDF->cupy converter.
    for make_dmatrix in constructors:
        with pytest.raises(ValueError, match="no nulls"):
            make_dmatrix(X, null_frame)

0 comments on commit e5f29dd

Please sign in to comment.