Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Migrate to Arrow PyCapsule Interface #63

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions h3ronpy/CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ Unreleased
----------

- The minimum supported python version is now 3.9.
- Migrate to the Arrow PyCapsule Interface, which is supported by pyarrow, polars (v1.2+), pandas (v2.2+), nanoarrow and other Arrow-compatible libraries.

0.21.1 - 2024-10-04
-------------------
Expand Down
3 changes: 2 additions & 1 deletion h3ronpy/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ name = "h3ronpy"
crate-type = ["cdylib"]

[dependencies]
arrow = { workspace = true, features = ["pyarrow"] }
arrow = { workspace = true}
env_logger = "^0.11"
geo-types = { workspace = true }
geo = { workspace = true }
Expand All @@ -26,5 +26,6 @@ numpy = "0.21"
ordered-float = ">=2.0.1"
py_geo_interface = { version = "0.8", features = ["f64", "wkb"] }
pyo3 = { version = "^0.21", features = ["extension-module", "abi3", "abi3-py39"] }
pyo3-arrow = "0.2"
rasterh3 = { version = "^0.8", features = ["rayon"] }
rayon = { workspace = true }
8 changes: 6 additions & 2 deletions h3ronpy/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ readme = "../README.rst"

dependencies = [
"numpy<2",
"pyarrow>=17.0"
"arro3-core"
]
classifiers = [
"Programming Language :: Python :: 3",
Expand All @@ -32,11 +32,15 @@ classifiers = [

[project.optional-dependencies]
polars = [
"polars>=1"
"polars>=1.2"
]
pandas = [
"pyarrow>=17.0",
"geopandas>=1"
]
pyarrow = [
"pyarrow>=17.0"
]
test = [
"rasterio",
"Shapely>=1.7",
Expand Down
43 changes: 18 additions & 25 deletions h3ronpy/python/h3ronpy/arrow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,6 @@ def _to_arrow_array(arr, dtype) -> pa.Array:
return converted


def _to_uint64_array(arr) -> pa.Array:
return _to_arrow_array(arr, pa.uint64())


def change_resolution(arr, resolution: int) -> pa.Array:
"""
Expand All @@ -38,7 +35,7 @@ def change_resolution(arr, resolution: int) -> pa.Array:

Invalid/empty values are omitted.
"""
return op.change_resolution(_to_uint64_array(arr), resolution)
return op.change_resolution(arr, resolution)


def change_resolution_list(arr, resolution: int) -> pa.Array:
Expand All @@ -50,7 +47,7 @@ def change_resolution_list(arr, resolution: int) -> pa.Array:

Invalid/empty values are preserved as such.
"""
return op.change_resolution_list(_to_uint64_array(arr), resolution)
return op.change_resolution_list(arr, resolution)


def change_resolution_paired(arr, resolution: int) -> pa.Table:
Expand All @@ -61,7 +58,7 @@ def change_resolution_paired(arr, resolution: int) -> pa.Table:
This can be helpful when joining data in different resolutions via
dataframe libraries
"""
return op.change_resolution_paired(_to_uint64_array(arr), resolution)
return op.change_resolution_paired(arr, resolution)


def cells_resolution(arr) -> pa.Array:
Expand All @@ -72,7 +69,7 @@ def cells_resolution(arr) -> pa.Array:
:param arr:
:return:
"""
return op.cells_resolution(_to_uint64_array(arr))
return op.cells_resolution(arr)


def cells_parse(arr, set_failing_to_invalid: bool = False) -> pa.Array:
Expand Down Expand Up @@ -121,7 +118,7 @@ def compact(arr, mixed_resolutions: bool = False) -> pa.Array:
The cells are expected to be of the same resolution, otherwise this operation will fail unless
`mixed_resolutions` is set to True. Setting this may lead to slight slow-downs.
"""
return op.compact(_to_uint64_array(arr), mixed_resolutions=mixed_resolutions)
return op.compact(arr, mixed_resolutions=mixed_resolutions)


def uncompact(arr, target_resolution: int) -> pa.Array:
Expand All @@ -131,12 +128,12 @@ def uncompact(arr, target_resolution: int) -> pa.Array:
All cells in the input array with a higher resolution than the given `target_resolution` will
be omitted from the output.
"""
return op.uncompact(_to_uint64_array(arr), target_resolution)
return op.uncompact(arr, target_resolution)


def _make_h3index_valid_wrapper(fn, h3index_name, wrapper_name):
def valid_wrapper(arr, booleanarray: bool = False) -> pa.Array:
return fn(_to_uint64_array(arr), booleanarray=booleanarray)
return fn(arr, booleanarray=booleanarray)

valid_wrapper.__doc__ = f"""
Validate an array of potentially invalid {h3index_name} values by returning a new
Expand All @@ -155,46 +152,46 @@ def valid_wrapper(arr, booleanarray: bool = False) -> pa.Array:


def grid_disk(cellarray, k: int, flatten: bool = False) -> Union[pa.ListArray, pa.Array]:
return op.grid_disk(_to_uint64_array(cellarray), k, flatten=flatten)
return op.grid_disk(cellarray, k, flatten=flatten)


def grid_disk_distances(cellarray, k: int, flatten: bool = False) -> pa.Table:
return op.grid_disk_distances(_to_uint64_array(cellarray), k, flatten=flatten)
return op.grid_disk_distances(cellarray, k, flatten=flatten)


def grid_disk_aggregate_k(cellarray, k: int, aggregation_method: str) -> pa.Table:
"""
Valid values for `aggregation_method` are `"min"` and `"max"`.
"""
return op.grid_disk_aggregate_k(_to_uint64_array(cellarray), k, aggregation_method)
return op.grid_disk_aggregate_k(cellarray, k, aggregation_method)


def grid_ring_distances(cellarray, k_min: int, k_max: int, flatten: bool = False) -> pa.Table:
return op.grid_ring_distances(_to_uint64_array(cellarray), k_min, k_max, flatten=flatten)
return op.grid_ring_distances(cellarray, k_min, k_max, flatten=flatten)


def cells_area_m2(cellarray) -> pa.Array:
return op.cells_area_m2(_to_uint64_array(cellarray))
return op.cells_area_m2(cellarray)


def cells_area_km2(cellarray) -> pa.Array:
return op.cells_area_km2(_to_uint64_array(cellarray))
return op.cells_area_km2(cellarray)


def cells_area_rads2(cellarray) -> pa.Array:
return op.cells_area_rads2(_to_uint64_array(cellarray))
return op.cells_area_rads2(cellarray)


def cells_to_string(cellarray) -> pa.Array:
return op.cells_to_string(_to_uint64_array(cellarray))
return op.cells_to_string(cellarray)


def vertexes_to_string(vertexesarray) -> pa.Array:
return op.vertexes_to_string(_to_uint64_array(vertexesarray))
return op.vertexes_to_string(vertexesarray)


def directededges_to_string(directededgearray) -> pa.Array:
return op.directededges_to_string(_to_uint64_array(directededgearray))
return op.directededges_to_string(directededgearray)


def cells_to_localij(cellarray, anchor, set_failing_to_invalid: bool = False) -> pa.Table:
Expand All @@ -213,9 +210,7 @@ def cells_to_localij(cellarray, anchor, set_failing_to_invalid: bool = False) ->
successfully. When `set_failing_to_invalid` is set to True, only the failing positions
of the output arrays will be set to null.
"""
if type(anchor) is not int:
anchor = _to_uint64_array(anchor)
return op.cells_to_localij(_to_uint64_array(cellarray), anchor, set_failing_to_invalid=set_failing_to_invalid)
return op.cells_to_localij(cellarray, anchor, set_failing_to_invalid=set_failing_to_invalid)


def localij_to_cells(anchor, i, j, set_failing_to_invalid: bool = False) -> pa.Array:
Expand All @@ -226,8 +221,6 @@ def localij_to_cells(anchor, i, j, set_failing_to_invalid: bool = False) -> pa.A
successfully. When `set_failing_to_invalid` is set to True, only the failing positions
of the output arrays will be set to null.
"""
if type(anchor) is not int:
anchor = _to_uint64_array(anchor)
return op.localij_to_cells(
anchor,
_to_arrow_array(i, pa.int32()),
Expand Down
12 changes: 4 additions & 8 deletions h3ronpy/python/h3ronpy/arrow/raster.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@

from h3ronpy.h3ronpyrs import raster
from .. import DEFAULT_CELL_COLUMN_NAME
from . import _to_uint64_array, _to_arrow_array
from . import _to_arrow_array
from .vector import cells_to_wkb_polygons, cells_bounds
import numpy as np
import pyarrow as pa
Expand Down Expand Up @@ -95,7 +95,7 @@ def raster_to_dataframe(
:param h3_resolution: Target h3 resolution
:param compact: Return compacted h3 indexes (see H3 docs). This results in mixed H3 resolutions, but also can
reduce the amount of required memory.
:return: Tuple of arrow arrays
:return: arrow table
"""

dtype = in_raster.dtype
Expand Down Expand Up @@ -123,10 +123,7 @@ def raster_to_dataframe(
else:
raise NotImplementedError(f"no raster_to_h3 implementation for dtype {dtype.name}")

return pa.Table.from_arrays(
arrays=func(in_raster, _get_transform(transform), h3_resolution, axis_order, compact, nodata_value),
names=["value", DEFAULT_CELL_COLUMN_NAME],
)
return func(in_raster, _get_transform(transform), h3_resolution, axis_order, compact, nodata_value)


def rasterize_cells(
Expand All @@ -149,7 +146,6 @@ def rasterize_cells(
from rasterio.features import rasterize
import shapely

cells = _to_uint64_array(cells)
values = _to_arrow_array(values, None)

if len(cells) != len(values):
Expand Down Expand Up @@ -190,7 +186,7 @@ def rasterize_cells(

# linking cells should speed up rendering in case of large homogeneous areas
polygons = cells_to_wkb_polygons(cells, link_cells=True)
polygons = [shapely.from_wkb(polygon.as_py()) for polygon in polygons.filter(polygons.is_valid())]
polygons = [shapely.from_wkb(polygon.as_py()) for polygon in polygons if polygon]

# draw
rasterize(
Expand Down
16 changes: 8 additions & 8 deletions h3ronpy/python/h3ronpy/arrow/vector.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from h3ronpy.h3ronpyrs import vector
from .. import ContainmentMode
from . import _to_uint64_array, _HAS_POLARS, _to_arrow_array
from . import _HAS_POLARS, _to_arrow_array
from typing import Optional, Tuple, Union
import pyarrow as pa

Expand All @@ -9,7 +9,7 @@ def cells_to_coordinates(arr, radians: bool = False) -> pa.Table:
"""
Convert cells to point coordinates, in degrees unless `radians` is set to True.
"""
return vector.cells_to_coordinates(_to_uint64_array(arr), radians=radians)
return vector.cells_to_coordinates(arr, radians=radians)


def coordinates_to_cells(latarray, lngarray, resarray, radians: bool = False) -> pa.Array:
Expand All @@ -35,15 +35,15 @@ def cells_bounds(arr) -> Optional[Tuple]:
"""
Bounds of the complete array as a tuple `(minx, miny, maxx, maxy)`.
"""
return vector.cells_bounds(_to_uint64_array(arr))
return vector.cells_bounds(arr)


def cells_bounds_arrays(arr) -> pa.Table:
"""
Build a table/dataframe with the columns `minx`, `miny`, `maxx` and `maxy` containing the bounds of the individual
cells from the input array.
"""
return vector.cells_bounds_arrays(_to_uint64_array(arr))
return vector.cells_bounds_arrays(arr)


def cells_to_wkb_polygons(arr, radians: bool = False, link_cells: bool = False) -> pa.Array:
Expand All @@ -57,7 +57,7 @@ def cells_to_wkb_polygons(arr, radians: bool = False, link_cells: bool = False)
:param radians: Generate geometries using radians instead of degrees
:param link_cells: Combine neighboring cells into a single polygon geometry.
"""
return vector.cells_to_wkb_polygons(_to_uint64_array(arr), radians=radians, link_cells=link_cells)
return vector.cells_to_wkb_polygons(arr, radians=radians, link_cells=link_cells)


def cells_to_wkb_points(arr, radians: bool = False) -> pa.Array:
Expand All @@ -69,7 +69,7 @@ def cells_to_wkb_points(arr, radians: bool = False) -> pa.Array:
:param arr: The cell array
:param radians: Generate geometries using radians instead of degrees
"""
return vector.cells_to_wkb_points(_to_uint64_array(arr), radians=radians)
return vector.cells_to_wkb_points(arr, radians=radians)


def vertexes_to_wkb_points(arr, radians: bool = False) -> pa.Array:
Expand All @@ -81,7 +81,7 @@ def vertexes_to_wkb_points(arr, radians: bool = False) -> pa.Array:
:param arr: The vertex array
:param radians: Generate geometries using radians instead of degrees
"""
return vector.vertexes_to_wkb_points(_to_uint64_array(arr), radians=radians)
return vector.vertexes_to_wkb_points(arr, radians=radians)


def directededges_to_wkb_linestrings(arr, radians: bool = False) -> pa.Array:
Expand All @@ -93,7 +93,7 @@ def directededges_to_wkb_linestrings(arr, radians: bool = False) -> pa.Array:
:param arr: The directed edge array
:param radians: Generate geometries using radians instead of degrees
"""
return vector.directededges_to_wkb_linestrings(_to_uint64_array(arr), radians=radians)
return vector.directededges_to_wkb_linestrings(arr, radians=radians)


def wkb_to_cells(
Expand Down
14 changes: 10 additions & 4 deletions h3ronpy/python/h3ronpy/pandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,23 @@
from .. import arrow as _arrow
import pyarrow as pa
from functools import wraps
from arro3.core import Array, Table
import pandas as pd
import pyarrow as pa


def _wrap(func, ret_type=None):
@wraps(func)
def wrapper(*args, **kw):
result = func(*args, **kw)
if isinstance(result, pa.Table):
return result.to_pandas(split_blocks=True, self_destruct=True)
elif isinstance(result, pa.Array):
return result.to_pandas()
if isinstance(result, Array):
return pa.array(result).to_pandas()
elif isinstance(result, Table):
return pa.table(result).to_pandas(split_blocks=True, self_destruct=True)
#elif isinstance(result, pa.Table):
# return result.to_pandas(split_blocks=True, self_destruct=True)
#elif isinstance(result, pa.Array):
# return result.to_pandas()
return result

if ret_type:
Expand Down
5 changes: 3 additions & 2 deletions h3ronpy/python/h3ronpy/pandas/raster.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import geopandas as gpd
import numpy as np
import pandas as pd
import pyarrow as pa
import typing

from ..arrow import raster as arrow_raster
Expand Down Expand Up @@ -39,9 +40,9 @@ def raster_to_dataframe(
:return: pandas `DataFrame` or `GeoDataFrame`
"""

df = arrow_raster.raster_to_dataframe(
df = pa.table(arrow_raster.raster_to_dataframe(
in_raster, transform, h3_resolution, nodata_value=nodata_value, axis_order=axis_order, compact=compact
).to_pandas()
)).to_pandas()

if geo:
return cells_dataframe_to_geodataframe(df)
Expand Down
2 changes: 1 addition & 1 deletion h3ronpy/python/h3ronpy/pandas/vector.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def wrapper(*args, **kw):

@wraps(wkb_to_cells)
def geoseries_to_cells(geoseries: gpd.GeoSeries, *args, **kw):
return _av.wkb_to_cells(geoseries.to_wkb(), *args, **kw).to_pandas()
return pa.array(_av.wkb_to_cells(geoseries.to_wkb(), *args, **kw)).to_pandas()


geoseries_to_cells.__name__ = "geoseries_to_cells"
Expand Down
8 changes: 5 additions & 3 deletions h3ronpy/python/h3ronpy/polars/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,18 @@
from functools import wraps
import typing
import polars as pl
import pyarrow as pa
from arro3.core import Array, Table
from .. import arrow as _arrow


def _wrap(func, ret_type=None):
@wraps(func, updated=())
def wrapper(*args, **kw):
result = func(*args, **kw)
if isinstance(result, pa.Table) or isinstance(result, pa.Array):
return pl.from_arrow(result)
if isinstance(result, Array):
return pl.Series(result)
elif isinstance(result, Table):
return pl.DataFrame(result)
return result

if ret_type:
Expand Down
Loading
Loading