diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 0c754317185..2199d4d5ba5 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -106,19 +106,6 @@ def _check_and_cast_columns_with_other( return _normalize_categorical(source_col.astype(common_dtype), other) -def _make_categorical_like(result, column): - if isinstance(column, cudf.core.column.CategoricalColumn): - result = cudf.core.column.build_categorical_column( - categories=column.categories, - codes=result, - mask=result.base_mask, - size=result.size, - offset=result.offset, - ordered=column.ordered, - ) - return result - - def _can_cast(from_dtype, to_dtype): """ Utility function to determine if we can cast diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index e7119fcdf47..5781d77ee9a 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -8,7 +8,6 @@ from cudf.core.column.column import ( ColumnBase, as_column, - build_categorical_column, build_column, column_empty, column_empty_like, diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index a7e98e5218f..de5ed15771d 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -52,6 +52,15 @@ _DEFAULT_CATEGORICAL_VALUE = np.int8(-1) +def as_unsigned_codes( + num_cats: int, codes: NumericalColumn +) -> NumericalColumn: + codes_dtype = min_unsigned_type(num_cats) + return cast( + cudf.core.column.numerical.NumericalColumn, codes.astype(codes_dtype) + ) + + class CategoricalAccessor(ColumnMethods): """ Accessor object for categorical properties of the Series values. @@ -637,13 +646,12 @@ def __setitem__(self, key, value): value = value.codes codes = self.codes codes[key] = value - out = cudf.core.column.build_categorical_column( - categories=self.categories, - codes=codes, - mask=codes.base_mask, + out = type(self)( + data=self.data, size=codes.size, - offset=self.offset, - ordered=self.ordered, + dtype=self.dtype, + mask=codes.base_mask, + children=(codes,), ) self._mimic_inplace(out, inplace=True) @@ -669,16 +677,13 @@ def _fill( def slice(self, start: int, stop: int, stride: int | None = None) -> Self: codes = self.codes.slice(start, stop, stride) - return cast( - Self, - cudf.core.column.build_categorical_column( - categories=self.categories, - codes=codes, - mask=codes.base_mask, - ordered=self.ordered, - size=codes.size, - offset=codes.offset, - ), + return type(self)( + data=self.data, # type: ignore[arg-type] + size=codes.size, + dtype=self.dtype, + mask=codes.base_mask, + offset=codes.offset, + children=(codes,), ) def _reduce( @@ -719,7 +724,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: ) return self.codes._binaryop(other.codes, op) - def normalize_binop_value(self, other: ScalarLike) -> CategoricalColumn: + def normalize_binop_value(self, other: ScalarLike) -> Self: if isinstance(other, column.ColumnBase): if not isinstance(other, CategoricalColumn): return NotImplemented @@ -727,30 +732,27 @@ def normalize_binop_value(self, other: ScalarLike) -> CategoricalColumn: raise TypeError( "Categoricals can only compare with the same type" ) - return other - - ary = column.as_column( + return cast(Self, other) + codes = column.as_column( self._encode(other), length=len(self), dtype=self.codes.dtype ) - return column.build_categorical_column( - categories=self.dtype.categories._values, - codes=column.as_column(ary), + return type(self)( + data=None, + size=self.size, + dtype=self.dtype, mask=self.base_mask, - ordered=self.dtype.ordered, + children=(codes,), # type: ignore[arg-type] ) - def sort_values( - self, ascending: bool = True, na_position="last" - ) -> CategoricalColumn: + def sort_values(self, ascending: bool = True, na_position="last") -> Self: codes = self.codes.sort_values(ascending, na_position) - col = column.build_categorical_column( - categories=self.dtype.categories._values, - codes=codes, - mask=codes.base_mask, + return type(self)( + data=self.data, # type: ignore[arg-type] size=codes.size, - ordered=self.dtype.ordered, + dtype=self.dtype, + mask=codes.base_mask, + children=(codes,), ) - return col def element_indexing(self, index: int) -> ScalarLike: val = self.codes.element_indexing(index) @@ -777,12 +779,12 @@ def to_pandas( if self.categories.dtype.kind == "f": new_mask = bools_to_mask(self.notnull()) - col = column.build_categorical_column( - categories=self.categories, - codes=column.as_column(self.codes, dtype=self.codes.dtype), + col = type(self)( + data=self.data, # type: ignore[arg-type] + size=self.size, + dtype=self.dtype, mask=new_mask, - ordered=self.dtype.ordered, - size=self.codes.size, + children=self.children, ) else: col = self @@ -849,15 +851,15 @@ def data_array_view( ) -> numba.cuda.devicearray.DeviceNDArray: return self.codes.data_array_view(mode=mode) - def unique(self) -> CategoricalColumn: + def unique(self) -> Self: codes = self.codes.unique() - return column.build_categorical_column( - categories=self.categories, - codes=codes, + return type(self)( + data=self.data, # type: ignore[arg-type] + size=codes.size, + dtype=self.dtype, mask=codes.base_mask, offset=codes.offset, - size=codes.size, - ordered=self.ordered, + children=(codes,), ) def _encode(self, value) -> ScalarLike: @@ -988,14 +990,17 @@ def find_and_replace( output = libcudf.replace.replace( replaced_codes, to_replace_col, replacement_col ) + codes = as_unsigned_codes(len(new_cats["cats"]), output) - result = column.build_categorical_column( - categories=new_cats["cats"], - codes=output, - mask=output.base_mask, - offset=output.offset, - size=output.size, - ordered=self.dtype.ordered, + result = type(self)( + data=self.data, # type: ignore[arg-type] + size=codes.size, + dtype=CategoricalDtype( + categories=new_cats["cats"], ordered=self.dtype.ordered + ), + mask=codes.base_mask, + offset=codes.offset, + children=(codes,), ) if result.dtype != self.dtype: warnings.warn( @@ -1082,7 +1087,7 @@ def is_monotonic_increasing(self) -> bool: def is_monotonic_decreasing(self) -> bool: return bool(self.ordered) and self.codes.is_monotonic_decreasing - def as_categorical_column(self, dtype: Dtype) -> CategoricalColumn: + def as_categorical_column(self, dtype: Dtype) -> Self: if isinstance(dtype, str) and dtype == "category": return self if isinstance(dtype, pd.CategoricalDtype): @@ -1099,7 +1104,23 @@ def as_categorical_column(self, dtype: Dtype) -> CategoricalColumn: if not isinstance(self.categories, type(dtype.categories._column)): # If both categories are of different Column types, # return a column full of Nulls. - return _create_empty_categorical_column(self, dtype) + codes = cast( + cudf.core.column.numerical.NumericalColumn, + column.as_column( + _DEFAULT_CATEGORICAL_VALUE, + length=self.size, + dtype=self.codes.dtype, + ), + ) + codes = as_unsigned_codes(len(dtype.categories), codes) + return type(self)( + data=self.data, # type: ignore[arg-type] + size=self.size, + dtype=dtype, + mask=self.base_mask, + offset=self.offset, + children=(codes,), + ) return self.set_categories( new_categories=dtype.categories, ordered=bool(dtype.ordered) @@ -1185,26 +1206,29 @@ def _concat( codes = [o for o in codes if len(o)] codes_col = libcudf.concat.concat_columns(objs) - return column.build_categorical_column( - categories=column.as_column(cats), - codes=codes_col, - mask=codes_col.base_mask, + codes_col = as_unsigned_codes( + len(cats), + cast(cudf.core.column.numerical.NumericalColumn, codes_col), + ) + return CategoricalColumn( + data=None, size=codes_col.size, + dtype=CategoricalDtype(categories=cats), + mask=codes_col.base_mask, offset=codes_col.offset, + children=(codes_col,), # type: ignore[arg-type] ) - def _with_type_metadata( - self: CategoricalColumn, dtype: Dtype - ) -> CategoricalColumn: + def _with_type_metadata(self: Self, dtype: Dtype) -> Self: if isinstance(dtype, CategoricalDtype): - return column.build_categorical_column( - categories=dtype.categories._values, - codes=self.codes, - mask=self.codes.base_mask, - ordered=dtype.ordered, + return type(self)( + data=self.data, # type: ignore[arg-type] size=self.codes.size, + dtype=dtype, + mask=self.codes.base_mask, offset=self.codes.offset, null_count=self.codes.null_count, + children=(self.codes,), ) return self @@ -1213,7 +1237,7 @@ def set_categories( new_categories: Any, ordered: bool = False, rename: bool = False, - ) -> CategoricalColumn: + ) -> Self: # See CategoricalAccessor.set_categories. ordered = ordered if ordered is not None else self.ordered @@ -1232,25 +1256,39 @@ def set_categories( "new_categories must have the same " "number of items as old categories" ) - - out_col = column.build_categorical_column( - categories=new_categories, - codes=self.base_children[0], - mask=self.base_mask, + out_col = type(self)( + data=self.data, # type: ignore[arg-type] size=self.size, + dtype=CategoricalDtype( + categories=new_categories, ordered=ordered + ), + mask=self.base_mask, offset=self.offset, - ordered=ordered, + children=(self.codes,), ) else: out_col = self if type(out_col.categories) is not type(new_categories): # If both categories are of different Column types, # return a column full of Nulls. - out_col = _create_empty_categorical_column( - self, - CategoricalDtype( + new_codes = cast( + cudf.core.column.numerical.NumericalColumn, + column.as_column( + _DEFAULT_CATEGORICAL_VALUE, + length=self.size, + dtype=self.codes.dtype, + ), + ) + new_codes = as_unsigned_codes(len(new_categories), new_codes) + out_col = type(self)( + data=self.data, # type: ignore[arg-type] + size=self.size, + dtype=CategoricalDtype( categories=new_categories, ordered=ordered ), + mask=self.base_mask, + offset=self.offset, + children=(new_codes,), ) elif ( not out_col._categories_equal(new_categories, ordered=True) @@ -1335,19 +1373,19 @@ def _set_categories( df.reset_index(drop=True, inplace=True) ordered = ordered if ordered is not None else self.ordered - new_codes = df._data["new_codes"] + new_codes = cast( + cudf.core.column.numerical.NumericalColumn, df._data["new_codes"] + ) # codes can't have masks, so take mask out before moving in - return cast( - Self, - column.build_categorical_column( - categories=new_cats, - codes=new_codes, - mask=new_codes.base_mask, - size=new_codes.size, - offset=new_codes.offset, - ordered=ordered, - ), + new_codes = as_unsigned_codes(len(new_cats), new_codes) + return type(self)( + data=self.data, # type: ignore[arg-type] + size=new_codes.size, + dtype=CategoricalDtype(categories=new_cats, ordered=ordered), + mask=new_codes.base_mask, + offset=new_codes.offset, + children=(new_codes,), ) def add_categories(self, new_categories: Any) -> Self: @@ -1425,56 +1463,16 @@ def remove_unused_categories(self) -> Self: "remove_unused_categories is currently not supported." ) - def as_ordered(self, ordered: bool): + def as_ordered(self, ordered: bool) -> Self: if self.dtype.ordered == ordered: return self - return column.build_categorical_column( - categories=self.categories, - codes=self.codes, - mask=self.base_mask, + return type(self)( + data=self.data, # type: ignore[arg-type] size=self.size, + dtype=CategoricalDtype( + categories=self.categories, ordered=ordered + ), + mask=self.base_mask, offset=self.offset, - ordered=ordered, + children=self.children, ) - - -def _create_empty_categorical_column( - categorical_column: CategoricalColumn, dtype: "CategoricalDtype" -) -> CategoricalColumn: - return column.build_categorical_column( - categories=column.as_column(dtype.categories), - codes=column.as_column( - _DEFAULT_CATEGORICAL_VALUE, - length=categorical_column.size, - dtype=categorical_column.codes.dtype, - ), - offset=categorical_column.offset, - size=categorical_column.size, - mask=categorical_column.base_mask, - ordered=dtype.ordered, - ) - - -def pandas_categorical_as_column( - categorical: ColumnLike, codes: ColumnLike | None = None -) -> CategoricalColumn: - """Creates a CategoricalColumn from a pandas.Categorical - - If ``codes`` is defined, use it instead of ``categorical.codes`` - """ - codes = categorical.codes if codes is None else codes - codes = column.as_column(codes) - - valid_codes = codes != codes.dtype.type(_DEFAULT_CATEGORICAL_VALUE) - - mask = None - if not valid_codes.all(): - mask = bools_to_mask(valid_codes) - - return column.build_categorical_column( - categories=categorical.categories, - codes=codes, - size=codes.size, - mask=mask, - ordered=categorical.ordered, - ) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 60b4126ddd4..885476a897c 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -352,13 +352,17 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: codes = libcudf.interop.from_arrow(indices_table)[0] categories = libcudf.interop.from_arrow(dictionaries_table)[0] - - return build_categorical_column( - categories=categories, - codes=codes, - mask=codes.base_mask, + codes = cudf.core.column.categorical.as_unsigned_codes( + len(categories), codes + ) + return cudf.core.column.CategoricalColumn( + data=None, size=codes.size, - ordered=array.type.ordered, + dtype=CategoricalDtype( + categories=categories, ordered=array.type.ordered + ), + mask=codes.base_mask, + children=(codes,), ) result = libcudf.interop.from_arrow(data)[0] @@ -950,10 +954,10 @@ def is_monotonic_decreasing(self) -> bool: ) def sort_values( - self: ColumnBase, + self: Self, ascending: bool = True, na_position: str = "last", - ) -> ColumnBase: + ) -> Self: if (not ascending and self.is_monotonic_decreasing) or ( ascending and self.is_monotonic_increasing ): @@ -1041,12 +1045,16 @@ def as_categorical_column(self, dtype) -> ColumnBase: and dtype._categories is not None ): cat_col = dtype._categories - labels = self._label_encoding(cats=cat_col) - return build_categorical_column( - categories=cat_col, - codes=labels, + codes = self._label_encoding(cats=cat_col) + codes = cudf.core.column.categorical.as_unsigned_codes( + len(cat_col), codes + ) + return cudf.core.column.categorical.CategoricalColumn( + data=None, + size=None, + dtype=dtype, mask=self.mask, - ordered=dtype.ordered, + children=(codes,), ) # Categories must be unique and sorted in ascending order. @@ -1058,15 +1066,16 @@ def as_categorical_column(self, dtype) -> ColumnBase: # columns include null index in factorization; remove: if self.has_nulls(): cats = cats.dropna() - min_type = min_unsigned_type(len(cats), 8) - if cudf.dtype(min_type).itemsize < labels.dtype.itemsize: - labels = labels.astype(min_type) - return build_categorical_column( - categories=cats, - codes=labels, + labels = cudf.core.column.categorical.as_unsigned_codes( + len(cats), labels + ) + return cudf.core.column.categorical.CategoricalColumn( + data=None, + size=None, + dtype=CategoricalDtype(categories=cats, ordered=ordered), mask=self.mask, - ordered=ordered, + children=(labels,), ) def as_numerical_column( @@ -1186,7 +1195,7 @@ def searchsorted( na_position=na_position, ) - def unique(self) -> ColumnBase: + def unique(self) -> Self: """ Get unique values in the data """ @@ -1695,51 +1704,6 @@ def build_column( raise TypeError(f"Unrecognized dtype: {dtype}") -def build_categorical_column( - categories: ColumnBase, - codes: ColumnBase, - mask: Buffer | None = None, - size: int | None = None, - offset: int = 0, - null_count: int | None = None, - ordered: bool = False, -) -> "cudf.core.column.CategoricalColumn": - """ - Build a CategoricalColumn - - Parameters - ---------- - categories : Column - Column of categories - codes : Column - Column of codes, the size of the resulting Column will be - the size of `codes` - mask : Buffer - Null mask - size : int, optional - offset : int, optional - ordered : bool, default False - Indicates whether the categories are ordered - """ - codes_dtype = min_unsigned_type(len(categories)) - codes = as_column(codes) - if codes.dtype != codes_dtype: - codes = codes.astype(codes_dtype) - - dtype = CategoricalDtype(categories=categories, ordered=ordered) - - result = build_column( - data=None, - dtype=dtype, - mask=mask, - size=size, - offset=offset, - null_count=null_count, - children=(codes,), - ) - return cast("cudf.core.column.CategoricalColumn", result) - - def check_invalid_array(shape: tuple, dtype): """Invalid ndarrays properties that are not supported""" if len(shape) > 1: diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 7f391c8a79c..78d2814ed26 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -651,22 +651,20 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool: return False - def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase: + def _with_type_metadata(self: Self, dtype: Dtype) -> ColumnBase: if isinstance(dtype, CategoricalDtype): - return column.build_categorical_column( - categories=dtype.categories._values, - codes=cudf.core.column.NumericalColumn( - self.base_data, # type: ignore[arg-type] - self.size, - dtype=self.dtype, - ), - mask=self.base_mask, - ordered=dtype.ordered, + codes = cudf.core.column.categorical.as_unsigned_codes( + len(dtype.categories), self + ) + return cudf.core.column.CategoricalColumn( + data=None, size=self.size, + dtype=dtype, + mask=self.base_mask, offset=self.offset, null_count=self.null_count, + children=(codes,), ) - return self def to_pandas( diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index a4ceea266b4..c9b1fa2669c 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -8,7 +8,8 @@ import cudf from cudf.api.types import is_list_like -from cudf.core.column import as_column, build_categorical_column +from cudf.core.column import as_column +from cudf.core.column.categorical import CategoricalColumn, as_unsigned_codes from cudf.core.index import IntervalIndex, interval_range @@ -282,13 +283,17 @@ def cut( # should allow duplicate categories. return interval_labels[index_labels] - col = build_categorical_column( - categories=interval_labels, - codes=index_labels, + index_labels = as_unsigned_codes(len(interval_labels), index_labels) + + col = CategoricalColumn( + data=None, + size=index_labels.size, + dtype=cudf.CategoricalDtype( + categories=interval_labels, ordered=ordered + ), mask=index_labels.base_mask, offset=index_labels.offset, - size=index_labels.size, - ordered=ordered, + children=(index_labels,), ) # we return a categorical index, as we don't have a Categorical method diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 6065e0e1eeb..0d632f4775f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -48,10 +48,10 @@ ColumnBase, StructColumn, as_column, - build_categorical_column, column_empty, concat_columns, ) +from cudf.core.column.categorical import as_unsigned_codes from cudf.core.column_accessor import ColumnAccessor from cudf.core.copy_types import BooleanMask from cudf.core.groupby.groupby import DataFrameGroupBy, groupby_doc_template @@ -3067,7 +3067,6 @@ def where(self, cond, other=None, inplace=False, axis=None, level=None): from cudf.core._internals.where import ( _check_and_cast_columns_with_other, - _make_categorical_like, ) # First process the condition. @@ -3119,7 +3118,7 @@ def where(self, cond, other=None, inplace=False, axis=None, level=None): out = [] for (name, col), other_col in zip(self._data.items(), other_cols): - col, other_col = _check_and_cast_columns_with_other( + source_col, other_col = _check_and_cast_columns_with_other( source_col=col, other=other_col, inplace=inplace, @@ -3127,16 +3126,16 @@ def where(self, cond, other=None, inplace=False, axis=None, level=None): if cond_col := cond._data.get(name): result = cudf._lib.copying.copy_if_else( - col, other_col, cond_col + source_col, other_col, cond_col ) - out.append(_make_categorical_like(result, self._data[name])) + out.append(result._with_type_metadata(col.dtype)) else: out_mask = cudf._lib.null_mask.create_null_mask( - len(col), + len(source_col), state=cudf._lib.null_mask.MaskState.ALL_NULL, ) - out.append(col.set_mask(out_mask)) + out.append(source_col.set_mask(out_mask)) return self._mimic_inplace( self._from_data_like_self(self._data._from_columns_like_self(out)), @@ -3296,9 +3295,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): # least require a deprecation cycle because we currently support # inserting a pd.Categorical. if isinstance(value, pd.Categorical): - value = cudf.core.column.categorical.pandas_categorical_as_column( - value - ) + value = as_column(value) if _is_scalar_or_zero_d_array(value): dtype = None @@ -8510,12 +8507,16 @@ def _cast_cols_to_common_dtypes(col_idxs, list_of_columns, dtypes, categories): def _reassign_categories(categories, cols, col_idxs): for name, idx in zip(cols, col_idxs): if idx in categories: - cols[name] = build_categorical_column( - categories=categories[idx], - codes=cols[name], - mask=cols[name].base_mask, - offset=cols[name].offset, - size=cols[name].size, + codes = as_unsigned_codes(len(categories[idx]), cols[name]) + cols[name] = CategoricalColumn( + data=None, + size=codes.size, + dtype=cudf.CategoricalDtype( + categories=categories[idx], ordered=False + ), + mask=codes.base_mask, + offset=codes.offset, + children=(codes,), ) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index a70a42c04af..5250a741d3d 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -13,7 +13,12 @@ import cudf from cudf.core.buffer import Buffer, as_buffer -from cudf.core.column import as_column, build_categorical_column, build_column +from cudf.core.column import ( + CategoricalColumn, + NumericalColumn, + as_column, + build_column, +) # Implementation of interchange protocol classes # ---------------------------------------------- @@ -830,18 +835,19 @@ def _protocol_to_cudf_column_categorical( assert buffers["data"] is not None, "data buffer should not be None" codes_buffer, codes_dtype = buffers["data"] codes_buffer = _ensure_gpu_buffer(codes_buffer, codes_dtype, allow_copy) - cdtype = protocol_dtype_to_cupy_dtype(codes_dtype) - codes = build_column( - codes_buffer._buf, - cdtype, + cdtype = np.dtype(protocol_dtype_to_cupy_dtype(codes_dtype)) + codes = NumericalColumn( + data=codes_buffer._buf, + size=None, + dtype=cdtype, ) - - cudfcol = build_categorical_column( - categories=categories, - codes=codes, - mask=codes.base_mask, + cudfcol = CategoricalColumn( + data=None, size=codes.size, - ordered=ordered, + dtype=cudf.CategoricalDtype(categories=categories, ordered=ordered), + mask=codes.base_mask, + offset=codes.offset, + children=(codes,), ) return _set_missing_values(col, cudfcol, allow_copy), buffers diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index cbe1e97d834..7b2bc85b13b 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -24,10 +24,10 @@ from cudf.core.column import ( ColumnBase, as_column, - build_categorical_column, deserialize_columns, serialize_columns, ) +from cudf.core.column.categorical import CategoricalColumn, as_unsigned_codes from cudf.core.column_accessor import ColumnAccessor from cudf.core.mixins import BinaryOperand, Scannable from cudf.utils import ioutils @@ -889,18 +889,21 @@ def from_arrow(cls, data: pa.Table) -> Self: for name in dict_dictionaries.keys() } - cudf_category_frame = { - name: build_categorical_column( - cudf_dictionaries_columns[name], - codes, - mask=codes.base_mask, + for name, codes in zip( + dict_indices_table.column_names, indices_columns + ): + categories = cudf_dictionaries_columns[name] + codes = as_unsigned_codes(len(categories), codes) + cudf_category_frame[name] = CategoricalColumn( + data=None, size=codes.size, - ordered=dict_ordered[name], - ) - for name, codes in zip( - dict_indices_table.column_names, indices_columns + dtype=cudf.CategoricalDtype( + categories=categories, + ordered=dict_ordered[name], + ), + mask=codes.base_mask, + children=(codes,), ) - } # Handle non-dict arrays cudf_non_category_frame = { diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 27c6556f976..500fc580097 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -3079,22 +3079,8 @@ def __init__( name = _getdefault_name(data, name=name) if isinstance(data, CategoricalColumn): data = data - elif isinstance(data, pd.Series) and ( - isinstance(data.dtype, pd.CategoricalDtype) - ): - codes_data = column.as_column(data.cat.codes.values) - data = column.build_categorical_column( - categories=data.cat.categories, - codes=codes_data, - ordered=data.cat.ordered, - ) - elif isinstance(data, (pd.Categorical, pd.CategoricalIndex)): - codes_data = column.as_column(data.codes) - data = column.build_categorical_column( - categories=data.categories, - codes=codes_data, - ordered=data.ordered, - ) + elif isinstance(getattr(data, "dtype", None), pd.CategoricalDtype): + data = column.as_column(data) else: data = column.as_column( data, dtype="category" if dtype is None else dtype diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index ad6aa56d472..fd6bf37f0e6 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -173,17 +173,7 @@ def _drop_columns(f: Frame, columns: abc.Iterable, errors: str): def _indices_from_labels(obj, labels): if not isinstance(labels, cudf.MultiIndex): labels = cudf.core.column.as_column(labels) - - if isinstance(obj.index.dtype, cudf.CategoricalDtype): - labels = labels.astype("category") - codes = labels.codes.astype(obj.index.codes.dtype) - labels = cudf.core.column.build_categorical_column( - categories=labels.dtype.categories, - codes=codes, - ordered=labels.dtype.ordered, - ) - else: - labels = labels.astype(obj.index.dtype) + labels = labels.astype(obj.index.dtype) idx_labels = cudf.Index._from_column(labels) else: idx_labels = labels diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 4be10752651..a831a798772 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -38,7 +38,9 @@ as_column, ) from cudf.core.column.categorical import ( + _DEFAULT_CATEGORICAL_VALUE, CategoricalAccessor as CategoricalAccessor, + CategoricalColumn, ) from cudf.core.column.column import concat_columns from cudf.core.column.lists import ListMethods @@ -511,9 +513,22 @@ def from_categorical(cls, categorical, codes=None): dtype: category Categories (3, object): ['a', 'b', 'c'] """ # noqa: E501 - col = cudf.core.column.categorical.pandas_categorical_as_column( - categorical, codes=codes - ) + col = as_column(categorical) + if codes is not None: + codes = as_column(codes) + + valid_codes = codes != codes.dtype.type(_DEFAULT_CATEGORICAL_VALUE) + + mask = None + if not valid_codes.all(): + mask = libcudf.transform.bools_to_mask(valid_codes) + col = CategoricalColumn( + data=col.data, + size=codes.size, + dtype=col.dtype, + mask=mask, + children=(codes,), + ) return Series._from_column(col) @classmethod diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index eb6714029cf..55dda34a576 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -350,7 +350,6 @@ def _get_elements_from_column(self, arg) -> ScalarLike | ColumnBase: def where(self, cond, other=None, inplace=False): from cudf.core._internals.where import ( _check_and_cast_columns_with_other, - _make_categorical_like, ) if isinstance(other, cudf.DataFrame): @@ -366,14 +365,12 @@ def where(self, cond, other=None, inplace=False): if not cudf.api.types.is_scalar(other): other = cudf.core.column.as_column(other) - self_column = self._column input_col, other = _check_and_cast_columns_with_other( - source_col=self_column, other=other, inplace=inplace + source_col=self._column, other=other, inplace=inplace ) result = cudf._lib.copying.copy_if_else(input_col, other, cond) - - return _make_categorical_like(result, self_column) + return result._with_type_metadata(self.dtype) @_performance_tracking def transpose(self): diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index d6b2ae2f31c..984115dcbbe 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -20,7 +20,8 @@ import cudf from cudf._lib import parquet as libparquet from cudf.api.types import is_list_like -from cudf.core.column import as_column, build_categorical_column, column_empty +from cudf.core.column import as_column, column_empty +from cudf.core.column.categorical import CategoricalColumn, as_unsigned_codes from cudf.utils import ioutils from cudf.utils.performance_tracking import _performance_tracking @@ -811,12 +812,17 @@ def _parquet_to_frame( partition_categories[name].index(value), length=_len, ) - dfs[-1][name] = build_categorical_column( - categories=partition_categories[name], - codes=codes, + codes = as_unsigned_codes( + len(partition_categories[name]), codes + ) + dfs[-1][name] = CategoricalColumn( + data=None, size=codes.size, + dtype=cudf.CategoricalDtype( + categories=partition_categories[name], ordered=False + ), offset=codes.offset, - ordered=False, + children=(codes,), ) else: # Not building categorical columns, so diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 5bd3eb5fa7f..9347ebba5de 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -64,8 +64,11 @@ def _nonempty_index(idx): values = cudf.core.column.as_column(data) return cudf.DatetimeIndex(values, name=idx.name) elif isinstance(idx, cudf.CategoricalIndex): - values = cudf.core.column.build_categorical_column( - categories=idx.categories, codes=[0, 0], ordered=idx.ordered + values = cudf.core.column.CategoricalColumn( + data=None, + size=None, + dtype=idx.dtype, + children=(cudf.core.column.as_column([0, 0], dtype=np.uint8),), ) return cudf.CategoricalIndex(values, name=idx.name) elif isinstance(idx, cudf.MultiIndex): @@ -105,12 +108,16 @@ def _get_non_empty_data( ) codes = cudf.core.column.as_column( 0, - dtype=cudf._lib.types.size_type_dtype, + dtype=np.uint8, length=2, ) - ordered = s.ordered # type: ignore[attr-defined] - return cudf.core.column.build_categorical_column( - categories=categories, codes=codes, ordered=ordered + return cudf.core.column.CategoricalColumn( + data=None, + size=codes.size, + dtype=cudf.CategoricalDtype( + categories=categories, ordered=s.dtype.ordered + ), + children=(codes,), # type: ignore[arg-type] ) elif isinstance(s.dtype, cudf.ListDtype): leaf_type = s.dtype.leaf_type diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index c025280c240..e793d4381d1 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -19,7 +19,7 @@ create_metadata_file_dd = None import cudf -from cudf.core.column import as_column, build_categorical_column +from cudf.core.column import CategoricalColumn, as_column from cudf.io import write_to_dataset from cudf.io.parquet import _apply_post_filters, _normalize_filters from cudf.utils.dtypes import cudf_dtype_from_pa_type @@ -163,12 +163,14 @@ def _read_paths( partitions[i].keys.get_loc(index2), length=len(df), ) - df[name] = build_categorical_column( - categories=partitions[i].keys, - codes=codes, + df[name] = CategoricalColumn( + data=None, size=codes.size, + dtype=cudf.CategoricalDtype( + categories=partitions[i].keys, ordered=False + ), offset=codes.offset, - ordered=False, + children=(codes,), ) elif name not in df.columns: # Add non-categorical partition column