Skip to content

Commit

Permalink
[breaking] [py] Drop support for datatable. (#11070)
Browse files Browse the repository at this point in the history
  • Loading branch information
trivialfis authored Dec 9, 2024
1 parent 6f9c9ae commit a361896
Show file tree
Hide file tree
Showing 14 changed files with 0 additions and 366 deletions.
4 changes: 0 additions & 4 deletions doc/python/python_intro.rst
Original file line number Diff line number Diff line change
Expand Up @@ -166,10 +166,6 @@ Support Matrix
+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
| dlpack | CPA | CPA | | CPA | FF | FF |
+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
| datatable.Frame | T | FF | | NPA | FF | |
+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
| datatable.Table | T | FF | | NPA | FF | |
+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
| modin.DataFrame | NPA | FF | NPA | NPA | FF | |
+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
| modin.Series | NPA | FF | NPA | NPA | FF | |
Expand Down
16 changes: 0 additions & 16 deletions include/xgboost/c_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -254,22 +254,6 @@ XGB_DLL int XGDMatrixCreateFromMat_omp(const float *data, // NOLINT
bst_ulong nrow, bst_ulong ncol,
float missing, DMatrixHandle *out,
int nthread);
/*!
 * \brief Create a DMatrix from the column buffers of a Python datatable frame.
 * \param data array of `ncol` pointers, one per column, each pointing to that column's data
 * \param feature_stypes array of `ncol` C strings, each naming the storage type of a column
 * \param nrow number of rows
 * \param ncol number of columns
 * \param out handle of the created DMatrix
 * \param nthread number of threads (up to maximum cores available, if <=0 use all cores)
 * \return 0 when success, -1 when failure happens
 */
XGB_DLL int XGDMatrixCreateFromDT(void** data,
const char ** feature_stypes,
bst_ulong nrow,
bst_ulong ncol,
DMatrixHandle* out,
int nthread);

/*!
* \brief Create DMatrix from CUDA columnar format. (cuDF)
Expand Down
2 changes: 0 additions & 2 deletions ops/conda_env/linux_cpu_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,5 +39,3 @@ dependencies:
- cloudpickle
- modin
- pyspark>=3.4.0
- pip:
- datatable
2 changes: 0 additions & 2 deletions ops/script/lint_python.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ class LintersPaths:
"tests/python/test_collective.py",
"tests/python/test_data_iterator.py",
"tests/python/test_dmatrix.py",
"tests/python/test_dt.py",
"tests/python/test_demos.py",
"tests/python/test_eval_metrics.py",
"tests/python/test_early_stopping.py",
Expand Down Expand Up @@ -94,7 +93,6 @@ class LintersPaths:
"python-package/",
# tests
"tests/python/test_collective.py",
"tests/python/test_dt.py",
"tests/python/test_demos.py",
"tests/python/test_data_iterator.py",
"tests/python/test_multi_target.py",
Expand Down
1 change: 0 additions & 1 deletion python-package/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ repository = "https://github.com/dmlc/xgboost"
pandas = ["pandas>=1.2"]
scikit-learn = ["scikit-learn"]
dask = ["dask", "pandas", "distributed"]
datatable = ["datatable"]
plotting = ["graphviz", "matplotlib"]
pyspark = ["pyspark", "scikit-learn", "cloudpickle"]

Expand Down
126 changes: 0 additions & 126 deletions python-package/xgboost/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -732,110 +732,6 @@ def _from_pandas_series(
)


def _is_dt_df(data: DataType) -> bool:
    """Check whether `data` is a ``datatable.Frame`` or ``datatable.DataTable``."""
    is_frame = lazy_isinstance(data, "datatable", "Frame")
    # The second lookup only runs when the first one did not match.
    return is_frame or lazy_isinstance(data, "datatable", "DataTable")


def _transform_dt_df(
    data: DataType,
    feature_names: Optional[FeatureNames],
    feature_types: Optional[FeatureTypes],
    meta: Optional[str] = None,
    meta_type: Optional[NumpyDType] = None,
) -> Tuple[np.ndarray, Optional[FeatureNames], Optional[FeatureTypes]]:
    """Validate a datatable frame and derive its feature names and types.

    Parameters
    ----------
    data :
        The datatable frame to validate.
    feature_names :
        User supplied feature names. When ``None``, the column names of the frame are
        used.
    feature_types :
        Must be ``None`` for feature data; the types are always derived from the
        frame itself.
    meta :
        Name of the meta info field. When given, the frame must have a single column,
        which is returned as a 1-dim numpy array.
    meta_type :
        dtype the meta info column is cast to, ``"float"`` when not specified.

    Returns
    -------
    The (possibly converted) data, the feature names and the feature types.  Names and
    types are ``None`` for meta info.

    Raises
    ------
    ValueError
        If a meta info frame has more than one column, a column has a logical type
        other than bool, int or real, or `feature_types` is passed in.
    """
    # Logical type name of a datatable column -> feature type reported for it.  The
    # keys double as the set of supported logical types.
    dt_feature_types = {"bool": "i", "int": "int", "real": "float"}

    if meta:
        if data.shape[1] > 1:
            raise ValueError("DataTable for meta info cannot have multiple columns")
        target_type = "float" if meta_type is None else meta_type
        # below requires new dt version
        # extract first column
        return data.to_numpy()[:, 0].astype(target_type), None, None

    data_types_names = tuple(lt.name for lt in data.ltypes)
    bad_fields = [
        name
        for name, type_name in zip(data.names, data_types_names)
        if type_name not in dt_feature_types
    ]
    if bad_fields:
        msg = """DataFrame.types for data must be int, float or bool.
Did not expect the data types in fields """
        raise ValueError(msg + ", ".join(bad_fields))

    if feature_names is None and meta is None:
        feature_names = data.names

    # always return stypes for dt ingestion
    if feature_types is not None:
        raise ValueError("DataTable has own feature types, cannot pass them in.")
    # A plain comprehension instead of `np.vectorize`: the latter cannot be called on
    # an empty input without `otypes`, which made a frame without columns fail here.
    feature_types = [dt_feature_types[type_name] for type_name in data_types_names]

    return data, feature_names, feature_types


def _from_dt_df(
    *,
    data: DataType,
    missing: Optional[FloatCompatible],
    nthread: int,
    feature_names: Optional[FeatureNames],
    feature_types: Optional[FeatureTypes],
    enable_categorical: bool,
) -> DispatchedDataBackendReturnType:
    """Create a DMatrix handle from a datatable frame.

    The frame is validated with `_transform_dt_df`, then the address of every column
    buffer and the name of every column stype are handed to
    ``XGDMatrixCreateFromDT``.

    Raises
    ------
    ValueError
        If `enable_categorical` is set; validation errors from `_transform_dt_df`
        propagate as well.
    """
    if enable_categorical:
        raise ValueError("categorical data in datatable is not supported yet.")

    data, feature_names, feature_types = _transform_dt_df(
        data=data,
        feature_names=feature_names,
        feature_types=feature_types,
        meta=None,
        meta_type=None,
    )

    # One raw pointer per column.
    column_ptrs = (ctypes.c_void_p * data.ncols)()
    has_column_accessor = hasattr(data, "internal") and hasattr(
        data.internal, "column"
    )
    if has_column_accessor:
        # datatable>0.8.0
        for col_idx in range(data.ncols):
            column = data.internal.column(col_idx)
            column_ptrs[col_idx] = ctypes.c_void_p(column.data_pointer)
    else:
        # datatable<=0.8.0
        from datatable.internal import (
            frame_column_data_r,  # pylint: disable=no-name-in-module
        )

        for col_idx in range(data.ncols):
            column_ptrs[col_idx] = frame_column_data_r(data, col_idx)

    # always return stypes for dt ingestion
    stype_names = (ctypes.c_char_p * data.ncols)()
    for col_idx in range(data.ncols):
        encoded_name = data.stypes[col_idx].name.encode("utf-8")
        stype_names[col_idx] = ctypes.c_char_p(encoded_name)

    _warn_unused_missing(data, missing)

    n_rows = c_bst_ulong(data.shape[0])
    n_cols = c_bst_ulong(data.shape[1])
    handle = ctypes.c_void_p()
    ret = _LIB.XGDMatrixCreateFromDT(
        column_ptrs,
        stype_names,
        n_rows,
        n_cols,
        ctypes.byref(handle),
        ctypes.c_int(nthread),
    )
    _check_call(ret)
    return handle, feature_names, feature_types


def _is_arrow(data: DataType) -> bool:
return lazy_isinstance(data, "pyarrow.lib", "Table") or lazy_isinstance(
data, "pyarrow._dataset", "Dataset"
Expand Down Expand Up @@ -1297,16 +1193,6 @@ def dispatch_data_backend(
raise TypeError("cupyx CSC is not supported yet.")
if _is_dlpack(data):
return _from_dlpack(data, missing, threads, feature_names, feature_types)
if _is_dt_df(data):
_warn_unused_missing(data, missing)
return _from_dt_df(
data=data,
missing=missing,
nthread=threads,
feature_names=feature_names,
feature_types=feature_types,
enable_categorical=enable_categorical,
)
if _is_modin_df(data):
return _from_pandas_df(
data=data,
Expand Down Expand Up @@ -1409,15 +1295,6 @@ def _meta_from_cupy_array(data: DataType, field: str, handle: ctypes.c_void_p) -
_check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface))


def _meta_from_dt(
    data: DataType, field: str, dtype: Optional[NumpyDType], handle: ctypes.c_void_p
) -> None:
    """Set the meta info `field` from a datatable frame.

    The frame is reduced to a numpy array by `_transform_dt_df` (called in meta mode)
    and the result is forwarded to `_meta_from_numpy`.
    """
    values, _names, _types = _transform_dt_df(
        data=data,
        feature_names=None,
        feature_types=None,
        meta=field,
        meta_type=dtype,
    )
    _meta_from_numpy(values, field, dtype, handle)


def dispatch_meta_backend(
matrix: DMatrix, data: DataType, name: str, dtype: Optional[NumpyDType] = None
) -> None:
Expand Down Expand Up @@ -1459,9 +1336,6 @@ def dispatch_meta_backend(
if _is_cudf_df(data):
_meta_from_cudf_df(data, name, handle)
return
if _is_dt_df(data):
_meta_from_dt(data, name, dtype, handle)
return
if _is_modin_df(data):
_meta_from_pandas_df(data, name, dtype=dtype, handle=handle)
return
Expand Down
4 changes: 0 additions & 4 deletions python-package/xgboost/testing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,10 +152,6 @@ def no_modin() -> PytestSkip:
return {"reason": "Failed import modin.", "condition": True}


def no_dt() -> PytestSkip:
    """Skip specification for tests that need ``datatable``; delegates to `no_mod`."""
    skip_spec: PytestSkip = no_mod("datatable")
    return skip_spec


def no_matplotlib() -> PytestSkip:
reason = "Matplotlib is not installed."
try:
Expand Down
11 changes: 0 additions & 11 deletions src/c_api/c_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -583,17 +583,6 @@ XGB_DLL int XGDMatrixCreateFromMat_omp(const bst_float* data, // NOLINT
API_END();
}

/*!
 * \brief C API entry point: build a DMatrix from datatable column buffers.
 *
 * Wraps the buffers in a DataTableAdapter and stores a newly allocated
 * std::shared_ptr<DMatrix> in `out`. NaN is used as the missing value.
 */
XGB_DLL int XGDMatrixCreateFromDT(void** data, const char** feature_stypes,
                                  xgboost::bst_ulong nrow, xgboost::bst_ulong ncol,
                                  DMatrixHandle* out, int nthread) {
  API_BEGIN();
  data::DataTableAdapter dt_adapter(data, feature_stypes, nrow, ncol);
  xgboost_CHECK_C_ARG_PTR(out);
  // Missing entries are already mapped to NaN by the adapter.
  auto const missing = std::nan("");
  auto* p_handle = new std::shared_ptr<DMatrix>(DMatrix::Create(&dt_adapter, missing, nthread));
  *out = p_handle;
  API_END();
}

XGB_DLL int XGDMatrixSliceDMatrix(DMatrixHandle handle, const int *idxset, xgboost::bst_ulong len,
DMatrixHandle *out) {
xgboost_CHECK_C_ARG_PTR(out);
Expand Down
1 change: 0 additions & 1 deletion src/common/quantile.cc
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,6 @@ void HostSketchContainer::PushAdapterBatch(Batch const &batch, size_t base_rowid
INSTANTIATE(ArrayAdapterBatch)
INSTANTIATE(CSRArrayAdapterBatch)
INSTANTIATE(CSCAdapterBatch)
INSTANTIATE(DataTableAdapterBatch)
INSTANTIATE(SparsePageAdapterBatch)
INSTANTIATE(ColumnarAdapterBatch)

Expand Down
125 changes: 0 additions & 125 deletions src/data/adapter.h
Original file line number Diff line number Diff line change
Expand Up @@ -536,131 +536,6 @@ class CSCArrayAdapter : public detail::SingleBatchDataIter<CSCArrayAdapterBatch>
[[nodiscard]] const CSCArrayAdapterBatch& Value() const override { return batch_; }
};

/*!
 * \brief Row-wise view over the raw column buffers of a Python datatable frame.
 *
 * The batch stores the array of column pointers given to the constructor (it does not
 * copy the column data) together with the parsed storage type of every column. Values
 * are converted to float on access; missing or non-finite entries are reported as NaN.
 */
class DataTableAdapterBatch : public detail::NoMetaInfo {
  /*! \brief Column storage types understood by the adapter. */
  enum class DTType : std::uint8_t {
    kFloat32 = 0,
    kFloat64 = 1,
    kBool8 = 2,
    kInt32 = 3,
    kInt8 = 4,
    kInt16 = 5,
    kInt64 = 6,
    kUnknown = 7
  };

  /*!
   * \brief Translate a storage type name into the matching DTType.
   *
   * Logs a fatal error when the name is not one of the supported types.
   */
  static DTType DTGetType(std::string const& type_string) {
    if (type_string == "float32") {
      return DTType::kFloat32;
    }
    if (type_string == "float64") {
      return DTType::kFloat64;
    }
    if (type_string == "bool8") {
      return DTType::kBool8;
    }
    if (type_string == "int32") {
      return DTType::kInt32;
    }
    if (type_string == "int8") {
      return DTType::kInt8;
    }
    if (type_string == "int16") {
      return DTType::kInt16;
    }
    if (type_string == "int64") {
      return DTType::kInt64;
    }
    LOG(FATAL) << "Unknown data table type.";
    return DTType::kUnknown;
  }

 public:
  /*!
   * \param data           Array of `num_features` pointers, one per column buffer.
   * \param feature_stypes Array of `num_features` storage type names.
   * \param num_rows       Number of rows in every column.
   * \param num_features   Number of columns.
   */
  DataTableAdapterBatch(void const* const* const data, char const* const* feature_stypes,
                        std::size_t num_rows, std::size_t num_features)
      : data_(data), num_rows_(num_rows) {
    // The final size is known, avoid re-allocation while parsing the type names.
    feature_types_.reserve(num_features);
    std::transform(feature_stypes, feature_stypes + num_features,
                   std::back_inserter(feature_types_),
                   [](char const* stype) { return DTGetType(stype); });
  }

 private:
  /*! \brief A single row of the table; elements are converted to float lazily. */
  class Line {
    std::size_t row_idx_;
    void const* const* const data_;
    std::vector<DTType> const& feature_types_;

    /*! \brief Value used to represent a missing entry. */
    static float Missing() { return std::numeric_limits<float>::quiet_NaN(); }

    /*! \brief Read a floating point entry, non-finite values are treated as missing. */
    template <typename T>
    static float FloatValue(void const* column, std::size_t ridx) {
      T const val = reinterpret_cast<T const*>(column)[ridx];
      return std::isfinite(val) ? static_cast<float>(val) : Missing();
    }

    /*! \brief Read an integer entry, the minimum of the type is the missing sentinel. */
    template <typename T>
    static float IntValue(void const* column, std::size_t ridx) {
      T const val = reinterpret_cast<T const*>(column)[ridx];
      return val != std::numeric_limits<T>::min() ? static_cast<float>(val) : Missing();
    }

    float DTGetValue(void const* column, DTType dt_type, std::size_t ridx) const {
      switch (dt_type) {
        case DTType::kFloat32:
          return FloatValue<float>(column, ridx);
        case DTType::kFloat64:
          return FloatValue<double>(column, ridx);
        case DTType::kBool8: {
          // Read the byte as an integer instead of through `bool*`: loading a `bool`
          // from a byte that is neither 0 nor 1 is undefined behaviour.
          // NOTE(review): this assumes bool8 uses the same -128 missing sentinel as
          // int8 - confirm against the datatable storage specification.
          auto const raw = reinterpret_cast<std::int8_t const*>(column)[ridx];
          if (raw == std::numeric_limits<std::int8_t>::min()) {
            return Missing();
          }
          return raw != 0 ? 1.0f : 0.0f;
        }
        case DTType::kInt32:
          return IntValue<std::int32_t>(column, ridx);
        case DTType::kInt8:
          return IntValue<std::int8_t>(column, ridx);
        case DTType::kInt16:
          return IntValue<std::int16_t>(column, ridx);
        case DTType::kInt64:
          return IntValue<std::int64_t>(column, ridx);
        default: {
          LOG(FATAL) << "Unknown data table type.";
          return 0.0f;
        }
      }
    }

   public:
    Line(std::size_t ridx, void const* const* const data, std::vector<DTType> const& ft)
        : row_idx_{ridx}, data_{data}, feature_types_{ft} {}
    /*! \brief Number of elements in the row, one per column. */
    [[nodiscard]] std::size_t Size() const { return feature_types_.size(); }
    /*! \brief The element of this row in column `idx`. */
    [[nodiscard]] COOTuple GetElement(std::size_t idx) const {
      return COOTuple{row_idx_, idx, DTGetValue(data_[idx], feature_types_[idx], row_idx_)};
    }
  };

 public:
  /*! \brief Number of rows in the batch. */
  [[nodiscard]] size_t Size() const { return num_rows_; }
  /*! \brief View of row `ridx`; it refers to this batch and must not outlive it. */
  [[nodiscard]] const Line GetLine(std::size_t ridx) const { return {ridx, data_, feature_types_}; }
  static constexpr bool kIsRowMajor = true;

 private:
  void const* const* const data_;

  std::vector<DTType> feature_types_;
  std::size_t num_rows_;
};

/*!
 * \brief Adapter exposing the column buffers of a datatable frame as a single
 *        DataTableAdapterBatch.
 */
class DataTableAdapter : public detail::SingleBatchDataIter<DataTableAdapterBatch> {
  DataTableAdapterBatch batch_;
  std::size_t num_rows_;
  std::size_t num_columns_;

 public:
  /*!
   * \param data           Array of column buffer pointers, forwarded to the batch.
   * \param feature_stypes Array of storage type names, forwarded to the batch.
   * \param num_rows       Number of rows.
   * \param num_features   Number of columns.
   */
  DataTableAdapter(void** data, const char** feature_stypes, std::size_t num_rows,
                   std::size_t num_features)
      : batch_{data, feature_stypes, num_rows, num_features},
        num_rows_{num_rows},
        num_columns_{num_features} {}

  [[nodiscard]] std::size_t NumRows() const { return num_rows_; }
  [[nodiscard]] std::size_t NumColumns() const { return num_columns_; }
  /*! \brief The one and only batch of this adapter. */
  [[nodiscard]] const DataTableAdapterBatch& Value() const override { return batch_; }
};

class ColumnarAdapterBatch : public detail::NoMetaInfo {
common::Span<ArrayInterface<1, false>> columns_;

Expand Down
Loading

0 comments on commit a361896

Please sign in to comment.