diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 375c74f29e63a..3253f9ab87363 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -53,13 +53,11 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.5.0 hooks: - - id: check-ast - id: check-case-conflict - id: check-toml - id: check-xml - id: check-yaml exclude: ^ci/meta.yaml$ - - id: debug-statements - id: end-of-file-fixer exclude: \.txt$ - id: mixed-line-ending diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index cea3b51f48b68..e5b2b80e42114 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -289,6 +289,7 @@ Other - Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`) - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`) - Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`) +- Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`) .. ***DO NOT USE THIS SECTION*** diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py index 2e60b2a2138e3..7effc42d5ba28 100644 --- a/pandas/core/interchange/column.py +++ b/pandas/core/interchange/column.py @@ -278,20 +278,28 @@ def _get_data_buffer( """ Return the buffer containing the data and the buffer's associated dtype. """ - if self.dtype[0] in ( - DtypeKind.INT, - DtypeKind.UINT, - DtypeKind.FLOAT, - DtypeKind.BOOL, - DtypeKind.DATETIME, - ): + if self.dtype[0] == DtypeKind.DATETIME: # self.dtype[2] is an ArrowCTypes.TIMESTAMP where the tz will make # it longer than 4 characters - if self.dtype[0] == DtypeKind.DATETIME and len(self.dtype[2]) > 4: + if len(self.dtype[2]) > 4: np_arr = self._col.dt.tz_convert(None).to_numpy() else: np_arr = self._col.to_numpy() buffer = PandasBuffer(np_arr, allow_copy=self._allow_copy) + dtype = ( + DtypeKind.INT, + 64, + ArrowCTypes.INT64, + Endianness.NATIVE, + ) + elif self.dtype[0] in ( + DtypeKind.INT, + DtypeKind.UINT, + DtypeKind.FLOAT, + DtypeKind.BOOL, + ): + np_arr = self._col.to_numpy() + buffer = PandasBuffer(np_arr, allow_copy=self._allow_copy) dtype = self.dtype elif self.dtype[0] == DtypeKind.CATEGORICAL: codes = self._col.values._codes @@ -314,7 +322,12 @@ def _get_data_buffer( # Define the dtype for the returned buffer # TODO: this will need correcting # https://github.com/pandas-dev/pandas/issues/54781 - dtype = self.dtype + dtype = ( + DtypeKind.UINT, + 8, + ArrowCTypes.UINT8, + Endianness.NATIVE, + ) # note: currently only support native endianness else: raise NotImplementedError(f"Data type {self._col.dtype} not handled yet") diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 4933cca97462f..e4fa6e4451a4c 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -435,3 +435,48 @@ def test_empty_dataframe(): result = pd.api.interchange.from_dataframe(dfi, allow_copy=False) expected = pd.DataFrame({"a": []}, dtype="int8") tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + ("data", "expected_dtype", "expected_buffer_dtype"), + [ + ( + pd.Series(["a", "b", "a"], dtype="category"), + (DtypeKind.CATEGORICAL, 8, "c", "="), + (DtypeKind.INT, 8, "c", "|"), + ), + ( + pd.Series( + [datetime(2022, 1, 1), datetime(2022, 1, 2), datetime(2022, 1, 3)] + ), + (DtypeKind.DATETIME, 64, "tsn:", "="), + (DtypeKind.INT, 64, ArrowCTypes.INT64, "="), + ), + ( + pd.Series(["a", "bc", None]), + (DtypeKind.STRING, 8, ArrowCTypes.STRING, "="), + (DtypeKind.UINT, 8, ArrowCTypes.UINT8, "="), + ), + ( + pd.Series([1, 2, 3]), + (DtypeKind.INT, 64, ArrowCTypes.INT64, "="), + (DtypeKind.INT, 64, ArrowCTypes.INT64, "="), + ), + ( + pd.Series([1.5, 2, 3]), + (DtypeKind.FLOAT, 64, ArrowCTypes.FLOAT64, "="), + (DtypeKind.FLOAT, 64, ArrowCTypes.FLOAT64, "="), + ), + ], +) +def test_buffer_dtype_categorical( + data: pd.Series, + expected_dtype: tuple[DtypeKind, int, str, str], + expected_buffer_dtype: tuple[DtypeKind, int, str, str], +) -> None: + # https://github.com/pandas-dev/pandas/issues/54781 + df = pd.DataFrame({"data": data}) + dfi = df.__dataframe__() + col = dfi.get_column_by_name("data") + assert col.dtype == expected_dtype + assert col.get_buffers()["data"][1] == expected_buffer_dtype