Refactor dataframe interchange namespace

pola-rs · Jul 4, 2022 · c05edcc · c05edcc
1 parent 937627b
commit c05edcc
Show file tree

Hide file tree

Showing 8 changed files with 715 additions and 481 deletions.
diff --git a/py-polars/polars/interchange/__init__.py b/py-polars/polars/interchange/__init__.py
diff --git a/py-polars/polars/interchange/_utils.py b/py-polars/polars/interchange/_utils.py
@@ -0,0 +1,84 @@
+from __future__ import annotations
+
+from enum import IntEnum
+from typing import Any, Optional, TypedDict
+
+import polars as pl
+
+
+class _IXDtypeKind(IntEnum):
+    INT = 0
+    UINT = 1
+    FLOAT = 2
+    BOOL = 20
+    STRING = 21  # UTF-8
+    DATETIME = 22
+    CATEGORICAL = 23
+
+
+class _IXNullKind(IntEnum):
+    NON_NULLABLE = 0
+    USE_NAN = 1
+    USE_SENTINEL = 2
+    USE_BITMASK = 3
+    USE_BYTEMASK = 4
+
+
+class _IXCategoricalDescription(TypedDict):
+    """See ``IXColumn.describe_categorical`` for more."""
+
+    is_ordered: bool
+    is_dictionary: bool
+    mapping: Optional[dict]
+
+
+class _IXArrowCTypes:
+    """
+    Enum for Apache Arrow C type format strings.
+
+    The Arrow C data interface:
+    https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings
+    """
+
+    NULL = "n"
+    BOOL = "b"
+    INT8 = "c"
+    UINT8 = "C"
+    INT16 = "s"
+    UINT16 = "S"
+    INT32 = "i"
+    UINT32 = "I"
+    INT64 = "l"
+    UINT64 = "L"
+    FLOAT16 = "e"
+    FLOAT32 = "f"
+    FLOAT64 = "g"
+    STRING = "u"  # utf-8
+    DATE32 = "tdD"
+    DATE64 = "tdm"
+    # Resolution:
+    #   - seconds -> 's'
+    #   - milliseconds -> 'm'
+    #   - microseconds -> 'u'
+    #   - nanoseconds -> 'n'
+    TIMESTAMP = "ts{resolution}:{tz}"
+    TIME = "tt{resolution}"
+
+
+class _IXEndianness:
+    """Enum indicating the byte-order of a data-type."""
+
+    LITTLE = "<"
+    BIG = ">"
+    NATIVE = "="
+    NA = "|"
+
+
+# TODO: remove Any
+def is_string_dtype(dtype: Any) -> bool:
+    ...
+
+
+# TODO: remove Any
+def _ix_chunk_to_polars_df(chunk: Any) -> pl.DataFrame:
+    pass
diff --git a/py-polars/polars/interchange/buffer.py b/py-polars/polars/interchange/buffer.py
@@ -0,0 +1,88 @@
+from enum import IntEnum
+
+# TODO: determine if we want to do this
+_PYARROW_AVAILABLE = True
+try:
+    import pyarrow as pa
+except ImportError:
+    _PYARROW_AVAILABLE = False
+
+# TODO
+NotSupportedError = Exception
+
+
+class _IXBuffer:
+    """
+    Data in the buffer is guaranteed to be contiguous in memory.
+
+    Note that there is no dtype attribute present, a buffer can be thought of
+    as simply a block of memory. However, if the column that the buffer is
+    attached to has a dtype that's supported by DLPack and ``__dlpack__`` is
+    implemented, then that dtype information will be contained in the return
+    value from ``__dlpack__``.
+
+    This distinction is useful to support both (a) data exchange via DLPack on
+    a buffer and (b) dtypes like variable-length strings which do not have a
+    fixed number of bytes per element.
+    """
+
+    def __init__(self, arr: pa.Array, allow_copy: bool = True) -> None:
+        if not isinstance(arr, pa.Array):
+            raise ValueError("`arr` must be a pyarrow Array")
+
+        # TODO: offsets are unhandled
+        # if arr.offset != 0:
+        #     raise NotImplementedError("`arr`s with offsets are not supported")
+
+        if arr.num_chunks() > 1:
+            if not allow_copy:
+                raise NotSupportedError("A copy is needed to flatten the ChunkedArray")
+
+            arr = arr.combine_chunks()
+
+        self._arr = arr
+        self._allow_copy = allow_copy
+
+    @property
+    def bufsize(self) -> int:
+        """Buffer size in bytes."""
+        # TODO: what about nbytes
+        return self._arr.get_total_buffer_size()
+
+    @property
+    def ptr(self) -> int:
+        """Pointer to start of the buffer as an integer."""
+        ...
+
+    def __dlpack__(self) -> None:
+        """
+        Produce DLPack capsule (see array API standard).
+
+        Raises: (TODO)
+
+            - TypeError : if the buffer contains unsupported dtypes.
+            - NotImplementedError : if DLPack support is not implemented
+
+        Useful to have to connect to array libraries. Support optional because
+        it's not completely trivial to implement for a Python-only library.
+        """
+        ...
+
+    def __dlpack_device__(self) -> tuple[IntEnum, int]:
+        """
+        Device type and device ID for where the data in the buffer resides.
+
+        Uses device type codes matching DLPack. Enum members are::
+
+            - CPU = 1
+            - CUDA = 2
+            - CPU_PINNED = 3
+            - OPENCL = 4
+            - VULKAN = 7
+            - METAL = 8
+            - VPI = 9
+            - ROCM = 10
+
+        Note: must be implemented even if ``__dlpack__`` is not.
+        """
+        ...