Skip to content

Commit

Permalink
Refactor dataframe interchange namespace
Browse files Browse the repository at this point in the history
  • Loading branch information
cnpryer committed Jul 4, 2022
1 parent 937627b commit c05edcc
Show file tree
Hide file tree
Showing 8 changed files with 715 additions and 481 deletions.
480 changes: 2 additions & 478 deletions py-polars/polars/interchange/__init__.py

Large diffs are not rendered by default.

84 changes: 84 additions & 0 deletions py-polars/polars/interchange/_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
from __future__ import annotations

from enum import IntEnum
from typing import Any, Optional, TypedDict

import polars as pl


class _IXDtypeKind(IntEnum):
    """
    Integer codes naming the kind of data stored in an interchange column.

    The values appear to mirror ``DtypeKind`` from the dataframe interchange
    protocol, which is why the numbering jumps from 2 to 20.
    """

    # Numeric kinds.
    INT = 0
    UINT = 1
    FLOAT = 2

    # Non-numeric kinds.
    BOOL = 20
    STRING = 21  # UTF-8
    DATETIME = 22
    CATEGORICAL = 23


class _IXNullKind(IntEnum):
    """
    Integer codes naming how a column represents missing values.

    The values appear to mirror ``ColumnNullType`` from the dataframe
    interchange protocol.
    """

    NON_NULLABLE = 0
    USE_NAN = 1
    USE_SENTINEL = 2
    USE_BITMASK = 3
    USE_BYTEMASK = 4


class _IXCategoricalDescription(TypedDict):
    """
    Typed dictionary describing a categorical column.

    See ``IXColumn.describe_categorical`` for more.
    """

    # Presumably: whether the order of the categories is meaningful.
    is_ordered: bool
    # Presumably: whether a dictionary-style mapping of categories exists.
    is_dictionary: bool
    # The mapping itself; may be ``None`` (the annotation is Optional).
    mapping: Optional[dict]


class _IXArrowCTypes:
    """
    Namespace of Apache Arrow C type format strings.

    This is a plain class holding string constants (not an ``enum.Enum``), so
    every attribute is an ordinary ``str``.

    The Arrow C data interface:
    https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings
    """

    NULL = "n"
    BOOL = "b"

    # Integers: lower case is signed, upper case is unsigned.
    INT8 = "c"
    UINT8 = "C"
    INT16 = "s"
    UINT16 = "S"
    INT32 = "i"
    UINT32 = "I"
    INT64 = "l"
    UINT64 = "L"

    # Floating point.
    FLOAT16 = "e"
    FLOAT32 = "f"
    FLOAT64 = "g"

    STRING = "u"  # utf-8

    # Temporal types.
    DATE32 = "tdD"
    DATE64 = "tdm"
    # TIMESTAMP and TIME are templates to be filled in with ``str.format``.
    # Resolution:
    #   - seconds      -> 's'
    #   - milliseconds -> 'm'
    #   - microseconds -> 'u'
    #   - nanoseconds  -> 'n'
    TIMESTAMP = "ts{resolution}:{tz}"
    TIME = "tt{resolution}"


class _IXEndianness:
    """
    Single-character codes indicating the byte-order of a data-type.

    This is a plain class of string constants rather than an ``enum.Enum``.
    NOTE(review): the characters look like NumPy's byte-order codes -- confirm.
    """

    LITTLE = "<"
    BIG = ">"
    NATIVE = "="
    NA = "|"


# TODO: remove Any
def is_string_dtype(dtype: Any) -> bool:
    """
    Report whether ``dtype`` is a string dtype.

    NOTE(review): this is an unimplemented placeholder. The body is empty, so
    every call currently returns ``None`` instead of the annotated ``bool``.
    """
    ...


# TODO: remove Any
def _ix_chunk_to_polars_df(chunk: Any) -> pl.DataFrame:
    """
    Convert one interchange chunk into a polars ``DataFrame``.

    NOTE(review): this is an unimplemented placeholder. The body is empty, so
    every call currently returns ``None`` instead of a ``DataFrame``.
    """
    return None
88 changes: 88 additions & 0 deletions py-polars/polars/interchange/buffer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
from enum import IntEnum

# TODO: determine if we want to do this
# Record whether the pyarrow import succeeded; ``pa`` is only bound when it did.
try:
    import pyarrow as pa
except ImportError:
    _PYARROW_AVAILABLE = False
else:
    _PYARROW_AVAILABLE = True

class NotSupportedError(Exception):
    """
    Raised when a requested operation is not supported.

    This is a dedicated subclass rather than an alias of ``Exception``. With
    an alias, ``except NotSupportedError`` would catch every exception and a
    raised error could not be told apart from any other ``Exception``.
    Handlers that catch ``Exception`` still see it.
    """


class _IXBuffer:
    """
    A contiguous block of memory wrapping a pyarrow array.

    Data in the buffer is guaranteed to be contiguous in memory.

    Note that there is no dtype attribute present, a buffer can be thought of
    as simply a block of memory. However, if the column that the buffer is
    attached to has a dtype that's supported by DLPack and ``__dlpack__`` is
    implemented, then that dtype information will be contained in the return
    value from ``__dlpack__``.

    This distinction is useful to support both (a) data exchange via DLPack on
    a buffer and (b) dtypes like variable-length strings which do not have a
    fixed number of bytes per element.
    """

    # Annotations are quoted because this module has no
    # ``from __future__ import annotations``: evaluated eagerly, ``pa.Array``
    # fails when pyarrow is missing and ``tuple[...]`` fails on Python < 3.9.
    def __init__(
        self, arr: "pa.Array | pa.ChunkedArray", allow_copy: bool = True
    ) -> None:
        """
        Wrap ``arr`` as a single contiguous array.

        Parameters
        ----------
        arr
            A pyarrow ``Array`` or ``ChunkedArray``.
        allow_copy
            Whether flattening a multi-chunk ``ChunkedArray`` (which copies
            the data) is permitted.

        Raises
        ------
        ImportError
            If pyarrow could not be imported.
        ValueError
            If ``arr`` is neither a pyarrow ``Array`` nor ``ChunkedArray``.
        NotSupportedError
            If ``arr`` has more than one chunk and ``allow_copy`` is False.
        """
        if not _PYARROW_AVAILABLE:
            # Fail with a clear message instead of a NameError on ``pa``.
            raise ImportError("pyarrow is required to create an interchange buffer")

        # TODO: offsets are unhandled
        # if arr.offset != 0:
        #     raise NotImplementedError("`arr`s with offsets are not supported")

        if isinstance(arr, pa.ChunkedArray):
            # ``num_chunks`` is a property on ChunkedArray, not a method.
            if arr.num_chunks == 1:
                # A single chunk is already contiguous: no copy required.
                arr = arr.chunk(0)
            else:
                if arr.num_chunks > 1 and not allow_copy:
                    raise NotSupportedError(
                        "A copy is needed to flatten the ChunkedArray"
                    )
                arr = arr.combine_chunks()
        elif not isinstance(arr, pa.Array):
            raise ValueError("`arr` must be a pyarrow Array or ChunkedArray")

        self._arr = arr
        self._allow_copy = allow_copy

    @property
    def bufsize(self) -> int:
        """Buffer size in bytes."""
        # TODO: what about nbytes
        return self._arr.get_total_buffer_size()

    @property
    def ptr(self) -> int:
        """Pointer to start of the buffer as an integer."""
        # NOTE(review): not implemented yet -- currently evaluates to ``None``.
        ...

    def __dlpack__(self) -> None:
        """
        Produce DLPack capsule (see array API standard).

        Raises: (TODO)
            - TypeError : if the buffer contains unsupported dtypes.
            - NotImplementedError : if DLPack support is not implemented

        Useful to have to connect to array libraries. Support optional because
        it's not completely trivial to implement for a Python-only library.
        """
        # DLPack export is not implemented; raise as documented above rather
        # than silently returning ``None`` to a consumer expecting a capsule.
        raise NotImplementedError("__dlpack__")

    def __dlpack_device__(self) -> "tuple[IntEnum, int]":
        """
        Device type and device ID for where the data in the buffer resides.

        Uses device type codes matching DLPack. Enum members are::

            - CPU = 1
            - CUDA = 2
            - CPU_PINNED = 3
            - OPENCL = 4
            - VULKAN = 7
            - METAL = 8
            - VPI = 9
            - ROCM = 10

        Note: must be implemented even if ``__dlpack__`` is not.
        """
        # NOTE(review): not implemented yet -- currently returns ``None``.
        ...
Loading

0 comments on commit c05edcc

Please sign in to comment.