Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Implement DataFrame Interchange Protocol #3727

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions py-polars/polars/interchange/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Re-export ``from_dataframe`` so it is importable directly from
# ``polars.interchange``; ``__all__`` limits the public names to it.
from polars.interchange.dataframe import from_dataframe

__all__ = ["from_dataframe"]
84 changes: 84 additions & 0 deletions py-polars/polars/interchange/_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
from __future__ import annotations

from enum import IntEnum
from typing import Any, Optional, TypedDict

import polars as pl


class _IXDtypeKind(IntEnum):
INT = 0
UINT = 1
FLOAT = 2
BOOL = 20
STRING = 21 # UTF-8
DATETIME = 22
CATEGORICAL = 23


class _IXNullKind(IntEnum):
NON_NULLABLE = 0
USE_NAN = 1
USE_SENTINEL = 2
USE_BITMASK = 3
USE_BYTEMASK = 4


class _IXCategoricalDescription(TypedDict):
"""See ``IXColumn.describe_categorical`` for more."""

is_ordered: bool
is_dictionary: bool
mapping: Optional[dict]


class _IXArrowCTypes:
"""
Enum for Apache Arrow C type format strings.

The Arrow C data interface:
https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings
"""

NULL = "n"
BOOL = "b"
INT8 = "c"
UINT8 = "C"
INT16 = "s"
UINT16 = "S"
INT32 = "i"
UINT32 = "I"
INT64 = "l"
UINT64 = "L"
FLOAT16 = "e"
FLOAT32 = "f"
FLOAT64 = "g"
STRING = "u" # utf-8
DATE32 = "tdD"
DATE64 = "tdm"
# Resoulution:
# - seconds -> 's'
# - milliseconds -> 'm'
# - microseconds -> 'u'
# - nanoseconds -> 'n'
TIMESTAMP = "ts{resolution}:{tz}"
TIME = "tt{resolution}"


class _IXEndianness:
"""Enum indicating the byte-order of a data-type."""

LITTLE = "<"
BIG = ">"
NATIVE = "="
NA = "|"


# TODO: remove Any
def is_string_dtype(dtype: Any) -> bool:
    """
    Placeholder for a check of whether ``dtype`` is a string data type.

    Not implemented yet: the function currently ignores ``dtype`` and
    returns ``None``.
    """
    return None  # type: ignore[return-value]


# TODO: remove Any
def _ix_chunk_to_polars_df(chunk: Any) -> pl.DataFrame:
pass
88 changes: 88 additions & 0 deletions py-polars/polars/interchange/buffer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
from enum import IntEnum

# TODO: determine if we want to do this
_PYARROW_AVAILABLE = True
try:
import pyarrow as pa
except ImportError:
_PYARROW_AVAILABLE = False

# TODO
NotSupportedError = Exception


class _IXBuffer:
"""
Data in the buffer is guaranteed to be contiguous in memory.

Note that there is no dtype attribute present, a buffer can be thought of
as simply a block of memory. However, if the column that the buffer is
attached to has a dtype that's supported by DLPack and ``__dlpack__`` is
implemented, then that dtype information will be contained in the return
value from ``__dlpack__``.

This distinction is useful to support both data exchange via DLPack on a
buffer and (b) dtypes like variable-length strings which do not have a
fixed number of bytes per element.
"""

def __init__(self, arr: pa.Array, allow_copy: bool = True) -> None:
if not isinstance(arr, pa.Array):
raise ValueError("`arr` must be a pyarrow Array")

# TODO: offsets are unhandled
# if arr.offset != 0:
# raise NotImplementedError("`arr`s with offsets are not supported")

if arr.num_chunks() > 1:
if not allow_copy:
raise NotSupportedError("A copy is needed to flatten the ChunkedArray")

arr = arr.combine_chunks()

self._arr = arr
self._allow_copy = allow_copy

@property
def bufsize(self) -> int:
"""Buffer size in bytes."""
# TODO: what about nbytes
return self._arr.get_total_buffer_size()

@property
def ptr(self) -> int:
"""Po to start of the buffer as an integer."""
...

def __dlpack__(self) -> None:
"""
Produce DLPack capsule (see array API standard).

Raises: (TODO)

- TypeError : if the buffer contains unsupported dtypes.
- NotImplementedError : if DLPack support is not implemented

Useful to have to connect to array libraries. Support optional because
it's not completely trivial to implement for a Python-only library.
"""
...

def __dlpack_device__(self) -> tuple[IntEnum, int]:
"""
Device type and device ID for where the data in the buffer resides.

Uses device type codes matching DLPack. Enum members are::

- CPU = 1
- CUDA = 2
- CPU_PINNED = 3
- OPENCL = 4
- VULKAN = 7
- METAL = 8
- VPI = 9
- ROCM = 10

Note: must be implemented even if ``__dlpack__`` is not.
"""
...
Loading