Skip to content

Commit

Permalink
REF: PandasColumn.describe_categorical returns categories instead of m…
Browse files Browse the repository at this point in the history
…apping (pandas-dev#47886)
  • Loading branch information
mroeschke authored and noatamir committed Nov 9, 2022
1 parent 5b39d00 commit f2e9513
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 18 deletions.
13 changes: 8 additions & 5 deletions pandas/core/interchange/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,15 +146,18 @@ def describe_categorical(self):
"""
If the dtype is categorical, there are two options:
- There are only values in the data buffer.
- There is a separate dictionary-style encoding for categorical values.
Raises RuntimeError if the dtype is not categorical
- There is a separate non-categorical Column encoding for categorical values.
Raises TypeError if the dtype is not categorical
Content of returned dict:
- "is_ordered" : bool, whether the ordering of dictionary indices is
semantically meaningful.
- "is_dictionary" : bool, whether a dictionary-style mapping of
categorical values to other objects exists
- "mapping" : dict, Python-level only (e.g. ``{int: str}``).
None if not a dictionary-style categorical.
- "categories" : Column representing the (implicit) mapping of indices to
category values (e.g. an array of cat1, cat2, ...).
None if not a dictionary-style categorical.
"""
if not self.dtype[0] == DtypeKind.CATEGORICAL:
raise TypeError(
Expand All @@ -164,7 +167,7 @@ def describe_categorical(self):
return {
"is_ordered": self._col.cat.ordered,
"is_dictionary": True,
"mapping": dict(enumerate(self._col.cat.categories)),
"categories": PandasColumn(pd.Series(self._col.cat.categories)),
}

@property
Expand Down
11 changes: 6 additions & 5 deletions pandas/core/interchange/dataframe_protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ class CategoricalDescription(TypedDict):
is_dictionary: bool
# Python-level only (e.g. ``{int: str}``).
# None if not a dictionary-style categorical.
mapping: dict | None
categories: Column | None


class Buffer(ABC):
Expand Down Expand Up @@ -274,17 +274,18 @@ def describe_categorical(self) -> CategoricalDescription:
"""
If the dtype is categorical, there are two options:
- There are only values in the data buffer.
- There is a separate dictionary-style encoding for categorical values.
- There is a separate non-categorical Column encoding for categorical values.
Raises TypeError if the dtype is not categorical
Returns the dictionary with description on how to interpret the data buffer:
- "is_ordered" : bool, whether the ordering of dictionary indices is
semantically meaningful.
- "is_dictionary" : bool, whether a dictionary-style mapping of
- "is_dictionary" : bool, whether a mapping of
categorical values to other objects exists
- "mapping" : dict, Python-level only (e.g. ``{int: str}``).
None if not a dictionary-style categorical.
- "categories" : Column representing the (implicit) mapping of indices to
category values (e.g. an array of cat1, cat2, ...).
None if not a dictionary-style categorical.
TBD: are there any other in-memory representations that are needed?
"""
Expand Down
8 changes: 5 additions & 3 deletions pandas/core/interchange/from_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import numpy as np

import pandas as pd
from pandas.core.interchange.column import PandasColumn
from pandas.core.interchange.dataframe_protocol import (
Buffer,
Column,
Expand Down Expand Up @@ -179,9 +180,10 @@ def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]:
if not categorical["is_dictionary"]:
raise NotImplementedError("Non-dictionary categoricals not supported yet")

mapping = categorical["mapping"]
assert isinstance(mapping, dict), "Categorical mapping must be a dict"
categories = np.array(tuple(mapping[k] for k in sorted(mapping)))
cat_column = categorical["categories"]
# for mypy/pyright
assert isinstance(cat_column, PandasColumn), "categories must be a PandasColumn"
categories = np.array(cat_column._col)
buffers = col.get_buffers()

codes_buff, codes_dtype = buffers["data"]
Expand Down
13 changes: 8 additions & 5 deletions pandas/tests/interchange/test_impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import pandas as pd
import pandas._testing as tm
from pandas.core.interchange.column import PandasColumn
from pandas.core.interchange.dataframe_protocol import (
ColumnNullType,
DtypeKind,
Expand Down Expand Up @@ -61,11 +62,13 @@ def test_categorical_dtype(data):
assert col.null_count == 0
assert col.describe_null == (ColumnNullType.USE_SENTINEL, -1)
assert col.num_chunks() == 1
assert col.describe_categorical == {
"is_ordered": data[1],
"is_dictionary": True,
"mapping": {0: "a", 1: "d", 2: "e", 3: "s", 4: "t"},
}
desc_cat = col.describe_categorical
assert desc_cat["is_ordered"] == data[1]
assert desc_cat["is_dictionary"] is True
assert isinstance(desc_cat["categories"], PandasColumn)
tm.assert_series_equal(
desc_cat["categories"]._col, pd.Series(["a", "d", "e", "s", "t"])
)

tm.assert_frame_equal(df, from_dataframe(df.__dataframe__()))

Expand Down

0 comments on commit f2e9513

Please sign in to comment.