Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: interchange protocol with nullable datatypes a non-null validity #57665

Merged
merged 11 commits into from
Mar 7, 2024
Merged
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ including other versions of pandas.

Fixed regressions
~~~~~~~~~~~~~~~~~
- :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when a column's type was a pandas nullable one with missing values (:issue:`56702`)
-

.. ---------------------------------------------------------------------------
Expand Down
18 changes: 17 additions & 1 deletion pandas/core/interchange/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,10 @@ def describe_categorical(self):

@property
def describe_null(self):
if isinstance(self._col.dtype, BaseMaskedDtype):
column_null_dtype = ColumnNullType.USE_BYTEMASK
null_value = 1
mroeschke marked this conversation as resolved.
Show resolved Hide resolved
return column_null_dtype, null_value
kind = self.dtype[0]
try:
null, value = _NULL_DESCRIPTION[kind]
Expand Down Expand Up @@ -298,7 +302,13 @@ def _get_data_buffer(
DtypeKind.FLOAT,
DtypeKind.BOOL,
):
np_arr = self._col.to_numpy()
arr = self._col.array
if isinstance(self._col.dtype, BaseMaskedDtype):
np_arr = arr._data # type: ignore[attr-defined]
elif isinstance(self._col.dtype, ArrowDtype):
raise NotImplementedError("ArrowDtype not handled yet")
else:
np_arr = arr._ndarray # type: ignore[attr-defined]
buffer = PandasBuffer(np_arr, allow_copy=self._allow_copy)
dtype = self.dtype
elif self.dtype[0] == DtypeKind.CATEGORICAL:
Expand Down Expand Up @@ -341,6 +351,12 @@ def _get_validity_buffer(self) -> tuple[PandasBuffer, Any]:
"""
null, invalid = self.describe_null

if isinstance(self._col.dtype, BaseMaskedDtype):
mask = self._col.array._mask # type: ignore[attr-defined]
buffer = PandasBuffer(mask)
dtype = (DtypeKind.BOOL, 8, ArrowCTypes.BOOL, Endianness.NATIVE)
return buffer, dtype

if self.dtype[0] == DtypeKind.STRING:
# For now, use byte array as the mask.
# TODO: maybe store as bit array to save space?..
Expand Down
44 changes: 38 additions & 6 deletions pandas/tests/interchange/test_impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
is_ci_environment,
is_platform_windows,
)
import pandas.util._test_decorators as td

import pandas as pd
import pandas._testing as tm
Expand Down Expand Up @@ -417,17 +416,50 @@ def test_non_str_names_w_duplicates():
pd.api.interchange.from_dataframe(dfi, allow_copy=False)


@pytest.mark.parametrize(
"dtype", ["Int8", pytest.param("Int8[pyarrow]", marks=td.skip_if_no("pyarrow"))]
)
def test_nullable_integers(dtype: str) -> None:
def test_nullable_integers() -> None:
Comment on lines -420 to +419
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

splitting the pyarrow dtype out of this test so I can xfail it

# https://github.com/pandas-dev/pandas/issues/55069
df = pd.DataFrame({"a": [1]}, dtype=dtype)
df = pd.DataFrame({"a": [1]}, dtype="Int8")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you include a None in here to test null?

And parametrize over ["Int8", "Float32", "boolean"]?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

boolean isn't supported yet, there's a separate issue about that: #55332

sure, I've added a separate test test_pandas_nullable_w_missing_values which includes a null

expected = pd.DataFrame({"a": [1]}, dtype="int8")
result = pd.api.interchange.from_dataframe(df.__dataframe__())
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/57664")
def test_nullable_integers_pyarrow() -> None:
    """Round-trip a one-row ``Int8[pyarrow]`` frame through the interchange API.

    The frame is exported with ``__dataframe__`` and re-imported with
    ``pd.api.interchange.from_dataframe``; the result is compared against a
    plain ``int8`` frame. Currently marked xfail (see the decorator's reason).
    """
    # https://github.com/pandas-dev/pandas/issues/55069
    source = pd.DataFrame({"a": [1]}, dtype="Int8[pyarrow]")
    interchange_obj = source.__dataframe__()
    roundtripped = pd.api.interchange.from_dataframe(interchange_obj)
    tm.assert_frame_equal(roundtripped, pd.DataFrame({"a": [1]}, dtype="int8"))


@pytest.mark.parametrize(
    ("data", "dtype", "expected_dtype"),
    [
        ([1, 2, None], "Int64", "int64"),
        ([1, 2, None], "UInt64", "uint64"),
        ([1.0, 2.25, None], "Float32", "float32"),
    ],
)
def test_pandas_nullable_w_missing_values(
    data: list, dtype: str, expected_dtype: str
) -> None:
    """Export a nullable column containing a missing value and read it with pyarrow.

    Builds a single-column frame from ``data`` using the pandas nullable
    ``dtype``, hands its ``__dataframe__()`` object to
    ``pyarrow.interchange.from_dataframe``, and checks the resulting column's
    type plus each of its three values (the last one must come back as
    ``None``).
    """
    # https://github.com/pandas-dev/pandas/issues/57643
    pytest.importorskip("pyarrow", "11.0.0")
    import pyarrow.interchange as pai

    frame = pd.DataFrame({"a": data}, dtype=dtype)
    column = pai.from_dataframe(frame.__dataframe__())["a"]
    assert column.type == expected_dtype
    # The first two entries are real values and must survive unchanged.
    for position in (0, 1):
        assert column[position].as_py() == data[position]
    # The trailing entry is the missing value.
    assert column[2].as_py() is None


def test_empty_dataframe():
# https://github.com/pandas-dev/pandas/issues/56700
df = pd.DataFrame({"a": []}, dtype="int8")
Expand Down