-
-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Refactor dataframe interchange namespace
- Loading branch information
Showing
8 changed files
with
715 additions
and
481 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
from __future__ import annotations | ||
|
||
from enum import IntEnum | ||
from typing import Any, Optional, TypedDict | ||
|
||
import polars as pl | ||
|
||
|
||
class _IXDtypeKind(IntEnum):
    """
    Integer codes identifying the kind of data held by an interchange column.

    The numeric values are fixed codes (note the jump from 2 to 20), not a
    dense enumeration, so members must not be renumbered.
    """

    INT = 0
    UINT = 1
    FLOAT = 2
    BOOL = 20
    STRING = 21  # UTF-8
    DATETIME = 22
    CATEGORICAL = 23
|
||
|
||
class _IXNullKind(IntEnum):
    """Integer codes naming the way a column represents missing values."""

    NON_NULLABLE = 0
    USE_NAN = 1
    USE_SENTINEL = 2
    USE_BITMASK = 3
    USE_BYTEMASK = 4
|
||
|
||
class _IXCategoricalDescription(TypedDict):
    """See ``IXColumn.describe_categorical`` for more."""

    # Whether the categories carry an ordering.
    is_ordered: bool
    # Whether a category mapping is available for the column.
    is_dictionary: bool
    # The category mapping itself; ``None`` is allowed by the annotation.
    # NOTE(review): presumably ``None`` goes with ``is_dictionary=False`` --
    # confirm against ``IXColumn.describe_categorical``.
    mapping: Optional[dict]
|
||
|
||
class _IXArrowCTypes:
    """
    Apache Arrow C type format strings, grouped as class-level constants.

    This is a plain namespace class rather than an ``enum.Enum``, so every
    attribute is an ordinary ``str``. ``TIMESTAMP`` and ``TIME`` are templates
    that must be completed with ``str.format`` before use.

    The Arrow C data interface:
    https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings
    """

    NULL = "n"
    BOOL = "b"
    INT8 = "c"
    UINT8 = "C"
    INT16 = "s"
    UINT16 = "S"
    INT32 = "i"
    UINT32 = "I"
    INT64 = "l"
    UINT64 = "L"
    FLOAT16 = "e"
    FLOAT32 = "f"
    FLOAT64 = "g"
    STRING = "u"  # utf-8
    DATE32 = "tdD"
    DATE64 = "tdm"
    # Resolution:
    #   - seconds -> 's'
    #   - milliseconds -> 'm'
    #   - microseconds -> 'u'
    #   - nanoseconds -> 'n'
    TIMESTAMP = "ts{resolution}:{tz}"
    TIME = "tt{resolution}"
|
||
|
||
class _IXEndianness:
    """
    Single-character markers for the byte order of a data type.

    This is a plain namespace class rather than an ``enum.Enum``; every
    attribute is an ordinary ``str``.
    """

    LITTLE = "<"
    BIG = ">"
    NATIVE = "="
    NA = "|"
|
||
|
||
# TODO: remove Any
def is_string_dtype(dtype: Any) -> bool:
    """
    Report whether ``dtype`` describes string data.

    NOTE(review): not implemented yet. The body is a placeholder, so a call
    currently returns ``None`` instead of the annotated ``bool``.
    """
    ...
|
||
|
||
# TODO: remove Any
def _ix_chunk_to_polars_df(chunk: Any) -> pl.DataFrame:
    """
    Convert one interchange chunk into a polars ``DataFrame``.

    NOTE(review): not implemented yet. The body is a placeholder, so a call
    currently returns ``None`` instead of the annotated ``pl.DataFrame``.
    """
    pass
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
from enum import IntEnum | ||
|
||
# TODO: determine if we want to do this | ||
_PYARROW_AVAILABLE = True | ||
try: | ||
import pyarrow as pa | ||
except ImportError: | ||
_PYARROW_AVAILABLE = False | ||
|
||
# TODO: settle on the final exception hierarchy for the interchange code.
class NotSupportedError(Exception):
    """
    Raised when a requested operation cannot be carried out as asked.

    This is a dedicated subclass rather than an alias of ``Exception``, so
    that ``except NotSupportedError`` does not also catch unrelated errors.
    """
|
||
|
||
class _IXBuffer:
    """
    Contiguous block of memory, backed by a single pyarrow array.

    Data in the buffer is guaranteed to be contiguous in memory.

    Note that there is no dtype attribute present, a buffer can be thought of
    as simply a block of memory. However, if the column that the buffer is
    attached to has a dtype that's supported by DLPack and ``__dlpack__`` is
    implemented, then that dtype information will be contained in the return
    value from ``__dlpack__``.

    This distinction is useful to support both (a) data exchange via DLPack on
    a buffer and (b) dtypes like variable-length strings which do not have a
    fixed number of bytes per element.
    """

    # Annotations are quoted so that defining the class never evaluates ``pa``
    # (which is unbound when pyarrow is not installed).
    def __init__(
        self, arr: "pa.Array | pa.ChunkedArray", allow_copy: bool = True
    ) -> None:
        """
        Wrap ``arr``, flattening it to a single array when it is chunked.

        Parameters
        ----------
        arr
            A pyarrow ``Array`` or ``ChunkedArray``.
        allow_copy
            Whether flattening a multi-chunk ``ChunkedArray`` (which copies
            data) is permitted.

        Raises
        ------
        ImportError
            If pyarrow is not installed.
        ValueError
            If ``arr`` is neither a pyarrow ``Array`` nor ``ChunkedArray``.
        NotSupportedError
            If ``arr`` has more than one chunk and ``allow_copy`` is false.
        """
        if not _PYARROW_AVAILABLE:
            raise ImportError("pyarrow is required to create an interchange buffer")

        if isinstance(arr, pa.ChunkedArray):
            # ``num_chunks`` is a property on ChunkedArray, not a method.
            if arr.num_chunks > 1 and not allow_copy:
                raise NotSupportedError("A copy is needed to flatten the ChunkedArray")
            # A lone chunk can be unwrapped directly; otherwise concatenate
            # (zero chunks yields an empty array).
            arr = arr.chunk(0) if arr.num_chunks == 1 else arr.combine_chunks()
        elif not isinstance(arr, pa.Array):
            raise ValueError("`arr` must be a pyarrow Array")

        # TODO: offsets are unhandled; decide whether arrays with a non-zero
        # ``arr.offset`` should be rejected with NotImplementedError.

        self._arr = arr
        self._allow_copy = allow_copy

    @property
    def bufsize(self) -> int:
        """Buffer size in bytes."""
        # TODO: what about nbytes
        return self._arr.get_total_buffer_size()

    @property
    def ptr(self) -> int:
        """
        Pointer to start of the buffer as an integer.

        Raises
        ------
        NotImplementedError
            Always, for now: the pointer lookup has not been written yet.
        """
        raise NotImplementedError("`ptr` is not implemented yet")

    def __dlpack__(self) -> None:
        """
        Produce DLPack capsule (see array API standard).

        Raises: (TODO)
            - TypeError : if the buffer contains unsupported dtypes.
            - NotImplementedError : if DLPack support is not implemented

        Useful to have to connect to array libraries. Support optional because
        it's not completely trivial to implement for a Python-only library.
        """
        raise NotImplementedError("__dlpack__")

    def __dlpack_device__(self) -> "tuple[IntEnum, int]":
        """
        Device type and device ID for where the data in the buffer resides.

        Uses device type codes matching DLPack. Enum members are::

            - CPU = 1
            - CUDA = 2
            - CPU_PINNED = 3
            - OPENCL = 4
            - VULKAN = 7
            - METAL = 8
            - VPI = 9
            - ROCM = 10

        Note: must be implemented even if ``__dlpack__`` is not.

        Raises
        ------
        NotImplementedError
            Always, for now: device reporting has not been written yet.
        """
        # TODO: return the real (device type, device id) pair.
        raise NotImplementedError("__dlpack_device__")
Oops, something went wrong.