diff --git a/python-package/xgboost/_data_utils.py b/python-package/xgboost/_data_utils.py
index f1b2f5d25d29..5229287f59e7 100644
--- a/python-package/xgboost/_data_utils.py
+++ b/python-package/xgboost/_data_utils.py
@@ -16,6 +16,11 @@ class _ArrayLikeArg(Protocol):
     def __array_interface__(self) -> "ArrayInf": ...
 
 
+class _CudaArrayLikeArg(Protocol):
+    @property
+    def __cuda_array_interface__(self) -> "ArrayInf": ...
+
+
 class TransformedDf(Protocol):
     """Protocol class for storing transformed dataframe."""
 
@@ -151,3 +156,12 @@ def array_interface(data: np.ndarray) -> bytes:
     interface = array_interface_dict(data)
     interface_str = bytes(json.dumps(interface), "utf-8")
     return interface_str
+
+
+def check_cudf_meta(data: _CudaArrayLikeArg, field: str) -> None:
+    "Make sure no missing value in meta data."
+    if (
+        "mask" in data.__cuda_array_interface__
+        and data.__cuda_array_interface__["mask"] is not None
+    ):
+        raise ValueError(f"Missing value is not allowed for: {field}")
diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py
index 8acad0f32eba..3ac5a6cc5376 100644
--- a/python-package/xgboost/data.py
+++ b/python-package/xgboost/data.py
@@ -29,6 +29,7 @@
     array_hasobject,
     array_interface,
     array_interface_dict,
+    check_cudf_meta,
     cuda_array_interface,
     make_array_interface,
 )
@@ -1555,14 +1556,15 @@ def _meta_from_cudf_df(data: DataType, field: str, handle: ctypes.c_void_p) -> N
 
 
 def _meta_from_cudf_series(data: DataType, field: str, handle: ctypes.c_void_p) -> None:
-    interface = bytes(json.dumps([data.__cuda_array_interface__], indent=2), "utf-8")
-    _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface))
+    check_cudf_meta(data, field)
+    inf = cuda_array_interface(data)
+    _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), inf))
 
 
 def _meta_from_cupy_array(data: DataType, field: str, handle: ctypes.c_void_p) -> None:
     data = _transform_cupy_array(data)
-    interface = bytes(json.dumps([data.__cuda_array_interface__], indent=2), "utf-8")
-    _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface))
+    inf = cuda_array_interface(data)
+    _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), inf))
 
 
 def dispatch_meta_backend(
@@ -1604,15 +1606,15 @@ def dispatch_meta_backend(
         data = _transform_dlpack(data)
         _meta_from_cupy_array(data, name, handle)
         return
-    if _is_cupy_alike(data):
-        _meta_from_cupy_array(data, name, handle)
-        return
     if _is_cudf_ser(data):
         _meta_from_cudf_series(data, name, handle)
         return
     if _is_cudf_df(data):
         _meta_from_cudf_df(data, name, handle)
         return
+    if _is_cupy_alike(data):
+        _meta_from_cupy_array(data, name, handle)
+        return
     if _is_modin_df(data):
         _meta_from_pandas_df(data, name, dtype=dtype, handle=handle)
         return
diff --git a/tests/python-gpu/test_from_cudf.py b/tests/python-gpu/test_from_cudf.py
index 0d138f14b29a..5565c6bf4b3f 100644
--- a/tests/python-gpu/test_from_cudf.py
+++ b/tests/python-gpu/test_from_cudf.py
@@ -382,3 +382,20 @@ def test_from_cudf_iter(enable_categorical):
     predict = reg.predict(m)
     predict_with_it = reg_with_it.predict(m_it)
     np.testing.assert_allclose(predict_with_it, predict)
+
+
+def test_invalid_meta() -> None:
+    df = cudf.DataFrame({"f0": [0, 1, 2], "f1": [2, 3, 4], "y": [None, 1, 2]})
+    y = df["y"]
+    X = df.drop(["y"], axis=1)
+    with pytest.raises(ValueError, match="Missing value"):
+        xgb.DMatrix(X, y)
+    with pytest.raises(ValueError, match="Missing value"):
+        xgb.QuantileDMatrix(X, y)
+    y = X.copy()
+    y.iloc[0, 0] = None
+    # check by the cuDF->cupy converter.
+    with pytest.raises(ValueError, match="no nulls"):
+        xgb.DMatrix(X, y)
+    with pytest.raises(ValueError, match="no nulls"):
+        xgb.QuantileDMatrix(X, y)