Re-structure module more logically in prepation for accessors

GlacioHack · Nov 14, 2024 · d9a92f2 · d9a92f2
1 parent b98f716
commit d9a92f2
Show file tree

Hide file tree

Showing 35 changed files with 3,868 additions and 3,382 deletions.
diff --git a/geoutils/__init__.py b/geoutils/__init__.py
@@ -5,7 +5,7 @@
 from geoutils import examples, projtools, raster, vector  # noqa
 from geoutils._config import config  # noqa
 from geoutils.raster import Mask, Raster, SatelliteImage  # noqa
-from geoutils.vector import Vector  # noqa
+from geoutils.vector.vector import Vector  # noqa
 
 try:
     from geoutils._version import __version__ as __version__  # noqa

diff --git a/geoutils/interface/__init__.py b/geoutils/interface/__init__.py
@@ -0,0 +1,5 @@
+from geoutils.interface.raster_point import *  # noqa
+from geoutils.interface.raster_vector import *  # noqa
+from geoutils.interface.gridding import *  # noqa
+from geoutils.interface.interpolate import *  # noqa
+from geoutils.interface.distance import *  # noqa
diff --git a/geoutils/interface/distance.py b/geoutils/interface/distance.py
@@ -0,0 +1,86 @@
+"""Functionalities related to distance operations."""
+from __future__ import annotations
+
+from typing import Literal
+import warnings
+
+import numpy as np
+import geopandas as gpd
+from scipy.ndimage import distance_transform_edt
+
+import geoutils as gu
+from geoutils._typing import NDArrayNum
+
+def _proximity_from_vector_or_raster(
+    raster: gu.Raster,
+    vector: gu.Vector | None = None,
+    target_values: list[float] | None = None,
+    geometry_type: str = "boundary",
+    in_or_out: Literal["in"] | Literal["out"] | Literal["both"] = "both",
+    distance_unit: Literal["pixel"] | Literal["georeferenced"] = "georeferenced",
+) -> NDArrayNum:
+    """
+    (This function is defined here as mostly raster-based, but used in a class method for both Raster and Vector)
+    Proximity to a Raster's target values if no Vector is provided, otherwise to a Vector's geometry type
+    rasterized on the Raster.
+
+    :param raster: Raster to burn the proximity grid on.
+    :param vector: Vector for which to compute the proximity to geometry,
+        if not provided computed on the Raster target pixels.
+    :param target_values: (Only with a Raster) List of target values to use for the proximity,
+        defaults to all non-zero values.
+    :param geometry_type: (Only with a Vector) Type of geometry to use for the proximity, defaults to 'boundary'.
+    :param in_or_out: (Only with a Vector) Compute proximity only 'in' or 'out'-side the geometry, or 'both'.
+    :param distance_unit: Distance unit, either 'georeferenced' or 'pixel'.
+    """
+
+    # 1/ First, if there is a vector input, we rasterize the geometry type
+    # (works with .boundary that is a LineString (.exterior exists, but is a LinearRing)
+    if vector is not None:
+
+        # TODO: Only when using centroid... Maybe we should leave this operation to the user anyway?
+        warnings.filterwarnings("ignore", message="Geometry is in a geographic CRS.*")
+
+        # We create a geodataframe with the geometry type
+        boundary_shp = gpd.GeoDataFrame(geometry=vector.ds.__getattr__(geometry_type), crs=vector.crs)
+        # We mask the pixels that make up the geometry type
+        mask_boundary = gu.Vector(boundary_shp).create_mask(raster, as_array=True)
+
+    else:
+        # We mask target pixels
+        if target_values is not None:
+            mask_boundary = np.logical_or.reduce([raster.get_nanarray() == target_val for target_val in target_values])
+        # Otherwise, all non-zero values are considered targets
+        else:
+            mask_boundary = raster.get_nanarray().astype(bool)
+
+    # 2/ Now, we compute the distance matrix relative to the masked geometry type
+    if distance_unit.lower() == "georeferenced":
+        sampling: int | tuple[float | int, float | int] = raster.res
+    elif distance_unit.lower() == "pixel":
+        sampling = 1
+    else:
+        raise ValueError('Distance unit must be either "georeferenced" or "pixel".')
+
+    # If not all pixels are targets, then we compute the distance
+    non_targets = np.count_nonzero(mask_boundary)
+    if non_targets > 0:
+        proximity = distance_transform_edt(~mask_boundary, sampling=sampling)
+    # Otherwise, pass an array full of nodata
+    else:
+        proximity = np.ones(np.shape(mask_boundary)) * np.nan
+
+    # 3/ If there was a vector input, apply the in_and_out argument to optionally mask inside/outside
+    if vector is not None:
+        if in_or_out == "both":
+            pass
+        elif in_or_out in ["in", "out"]:
+            mask_polygon = gu.Vector(vector.ds).create_mask(raster, as_array=True)
+            if in_or_out == "in":
+                proximity[~mask_polygon] = 0
+            else:
+                proximity[mask_polygon] = 0
+        else:
+            raise ValueError('The type of proximity must be one of "in", "out" or "both".')
+
+    return proximity
diff --git a/geoutils/pointcloud.py → geoutils/interface/gridding.py b/geoutils/pointcloud.py → geoutils/interface/gridding.py
@@ -1,4 +1,4 @@
-"""Module for point cloud manipulation."""
+"""Functionalities for gridding points (point cloud to raster)."""
 
 import warnings
 from typing import Literal

diff --git a/geoutils/raster/interpolate.py → geoutils/interface/interpolate.py b/geoutils/raster/interpolate.py → geoutils/interface/interpolate.py
@@ -1,3 +1,4 @@
+"""Functionalities for interpolating a regular grid at points (raster to point cloud)."""
 from __future__ import annotations
 
 from typing import Any, Callable, Literal, overload

diff --git a/geoutils/interface/raster_point.py b/geoutils/interface/raster_point.py
@@ -0,0 +1,238 @@
+"""Functionalities at the interface of rasters and point clouds."""
+from __future__ import annotations
+
+from typing import Literal, Iterable
+
+import numpy as np
+import geopandas as gpd
+import rasterio as rio
+
+import geoutils as gu
+from geoutils._typing import NDArrayNum
+from geoutils.raster.georeferencing import _default_nodata, _xy2ij
+from geoutils.raster.array import _get_mask_from_array
+from geoutils.raster.sampling import subsample_array
+
+def _regular_pointcloud_to_raster(
+    pointcloud: gpd.GeoDataFrame,
+    grid_coords: tuple[NDArrayNum, NDArrayNum] = None,
+    transform: rio.transform.Affine = None,
+    shape: tuple[int, int] = None,
+    nodata: int | float | None = None,
+    data_column_name: str = "b1",
+    area_or_point: Literal["Area", "Point"] = "Point",
+):
+    """
+    Convert a regular point cloud to a raster. See Raster.from_pointcloud_regular() for details.
+    """
+
+    # Get transform and shape from input
+    if grid_coords is not None:
+
+        # Input checks
+        if (
+                not isinstance(grid_coords, tuple)
+                or not (isinstance(grid_coords[0], np.ndarray) and grid_coords[0].ndim == 1)
+                or not (isinstance(grid_coords[1], np.ndarray) and grid_coords[1].ndim == 1)
+        ):
+            raise TypeError("Input grid coordinates must be 1D arrays.")
+
+        diff_x = np.diff(grid_coords[0])
+        diff_y = np.diff(grid_coords[1])
+
+        if not all(diff_x == diff_x[0]) and all(diff_y == diff_y[0]):
+            raise ValueError("Grid coordinates must be regular (equally spaced, independently along X and Y).")
+
+        # Build transform from min X, max Y and step in both
+        out_transform = rio.transform.from_origin(
+            np.min(grid_coords[0]), np.max(grid_coords[1]), diff_x[0], diff_y[0]
+        )
+        # Y is first axis, X is second axis
+        out_shape = (len(grid_coords[1]), len(grid_coords[0]))
+
+    elif transform is not None and shape is not None:
+
+        out_transform = transform
+        out_shape = shape
+
+    else:
+        raise ValueError("Either grid coordinates or both geotransform and shape must be provided.")
+
+    # Create raster from inputs, with placeholder data for now
+    dtype = pointcloud[data_column_name].dtype
+    out_nodata = nodata if not None else _default_nodata(dtype)
+    arr = np.ones(out_shape, dtype=dtype)
+
+    # Get indexes of point cloud coordinates in the raster, forcing no shift
+    i, j = _xy2ij(x=pointcloud.geometry.x.values, y=pointcloud.geometry.y.values, shift_area_or_point=False,
+                  transform=out_transform, area_or_point=area_or_point)
+
+    # If coordinates are not integer type (forced in xy2ij), then some points are not falling on exact coordinates
+    if not np.issubdtype(i.dtype, np.integer) or not np.issubdtype(i.dtype, np.integer):
+        raise ValueError("Some point cloud coordinates differ from the grid coordinates.")
+
+    # Set values
+    mask = np.ones(np.shape(arr), dtype=bool)
+    mask[i, j] = False
+    arr[i, j] = pointcloud[data_column_name].values
+
+    # Set output values
+    raster_arr = np.ma.masked_array(data=arr, mask=mask)
+
+    return raster_arr, out_transform, pointcloud.crs, out_nodata, area_or_point
+
+def _raster_to_pointcloud(
+    source_raster: gu.Raster,
+    data_column_name: str,
+    data_band: int,
+    auxiliary_data_bands: list[int] | None,
+    auxiliary_column_names: list[str] | None,
+    subsample: float | int,
+    skip_nodata: bool,
+    as_array: bool,
+    random_state: int | np.random.Generator | None,
+    force_pixel_offset: Literal["center", "ul", "ur", "ll", "lr"],
+) -> NDArrayNum | gu.Vector:
+    """
+    Convert a raster to a point cloud. See Raster.to_pointcloud() for details.
+    """
+
+    # Input checks
+
+    # Main data column checks
+    if not isinstance(data_column_name, str):
+        raise ValueError("Data column name must be a string.")
+    if not (isinstance(data_band, int) and data_band >= 1 and data_band <= source_raster.count):
+        raise ValueError(
+            f"Data band number must be an integer between 1 and the total number of bands ({source_raster.count})."
+        )
+
+    # Rename data column if a different band is selected but the name is still default
+    if data_band != 1 and data_column_name == "b1":
+        data_column_name = "b" + str(data_band)
+
+    # Auxiliary data columns checks
+    if auxiliary_column_names is not None and auxiliary_data_bands is None:
+        raise ValueError("Passing auxiliary column names requires passing auxiliary data band numbers as well.")
+    if auxiliary_data_bands is not None:
+        if not (
+                isinstance(auxiliary_data_bands, Iterable) and all(isinstance(b, int) for b in auxiliary_data_bands)
+        ):
+            raise ValueError("Auxiliary data band number must be an iterable containing only integers.")
+        if any((1 > b or source_raster.count < b) for b in auxiliary_data_bands):
+            raise ValueError(
+                f"Auxiliary data band numbers must be between 1 and the total number of bands ({source_raster.count})."
+            )
+        if data_band in auxiliary_data_bands:
+            raise ValueError(
+                f"Main data band {data_band} should not be listed in auxiliary data bands {auxiliary_data_bands}."
+            )
+
+        # Ensure auxiliary column name is defined if auxiliary data bands is not None
+        if auxiliary_column_names is not None:
+            if not (
+                    isinstance(auxiliary_column_names, Iterable)
+                    and all(isinstance(b, str) for b in auxiliary_column_names)
+            ):
+                raise ValueError("Auxiliary column names must be an iterable containing only strings.")
+            if not len(auxiliary_column_names) == len(auxiliary_data_bands):
+                raise ValueError(
+                    f"Length of auxiliary column name and data band numbers should be the same, "
+                    f"found {len(auxiliary_column_names)} and {len(auxiliary_data_bands)} respectively."
+                )
+
+        else:
+            auxiliary_column_names = [f"b{i}" for i in auxiliary_data_bands]
+
+        # Define bigger list with all bands and names
+        all_bands = [data_band] + auxiliary_data_bands
+        all_column_names = [data_column_name] + auxiliary_column_names
+
+    else:
+        all_bands = [data_band]
+        all_column_names = [data_column_name]
+
+    # If subsample is the entire array, load it to optimize speed
+    if subsample == 1 and not source_raster.is_loaded:
+        source_raster.load(bands=all_bands)
+
+    # Band indexes in the array are band number minus one
+    all_indexes = [b - 1 for b in all_bands]
+
+    # We do 2D subsampling on the data band only, regardless of valid masks on other bands
+    if skip_nodata:
+        if source_raster.is_loaded:
+            if source_raster.count == 1:
+                self_mask = _get_mask_from_array(
+                    source_raster.data
+                )  # This is to avoid the case where the mask is just "False"
+            else:
+                self_mask = _get_mask_from_array(
+                    source_raster.data[data_band - 1, :, :]
+                )  # This is to avoid the case where the mask is just "False"
+            valid_mask = ~self_mask
+
+        # Load only mask of valid data from disk if array not loaded
+        else:
+            valid_mask = ~source_raster._load_only_mask(bands=data_band)
+    # If we are not skipping nodata values, valid mask is everywhere
+    else:
+        if source_raster.count == 1:
+            valid_mask = np.ones(source_raster.data.shape, dtype=bool)
+        else:
+            valid_mask = np.ones(source_raster.data[0, :].shape, dtype=bool)
+
+    # Get subsample on valid mask
+    # Build a low memory boolean masked array with invalid values masked to pass to subsampling
+    ma_valid = np.ma.masked_array(data=np.ones(np.shape(valid_mask), dtype=bool), mask=~valid_mask)
+    # Take a subsample within the valid values
+    indices = subsample_array(array=ma_valid, subsample=subsample, random_state=random_state, return_indices=True)
+
+    # If the Raster is loaded, pick from the data while ignoring the mask
+    if source_raster.is_loaded:
+        if source_raster.count == 1:
+            pixel_data = source_raster.data[indices[0], indices[1]]
+        else:
+            # TODO: Combining both indexes at once could reduce memory usage?
+            pixel_data = source_raster.data[all_indexes, :][:, indices[0], indices[1]]
+
+    # Otherwise use rasterio.sample to load only requested pixels
+    else:
+        # Extract the coordinates at subsampled pixels with valid data
+        # To extract data, we always use "upper left" which rasterio interprets as the exact raster coordinates
+        # Further below we redefine output coordinates based on point interpretation
+        x_coords, y_coords = (np.array(a) for a in source_raster.ij2xy(indices[0], indices[1], force_offset="ul"))
+
+        with rio.open(source_raster.filename) as raster:
+            # Rasterio uses indexes (starts at 1)
+            pixel_data = np.array(list(raster.sample(zip(x_coords, y_coords), indexes=all_bands))).T
+
+    # At this point there should not be any nodata anymore, so we can transform everything to normal array
+    if np.ma.isMaskedArray(pixel_data):
+        pixel_data = pixel_data.data
+
+    # If nodata values were not skipped, convert them to NaNs and change data type
+    if skip_nodata is False:
+        pixel_data = pixel_data.astype("float32")
+        pixel_data[pixel_data == source_raster.nodata] = np.nan
+
+    # Now we force the coordinates we define for the point cloud, according to pixel interpretation
+    x_coords_2, y_coords_2 = (
+        np.array(a) for a in source_raster.ij2xy(indices[0], indices[1], force_offset=force_pixel_offset)
+    )
+
+    if not as_array:
+        points = gu.Vector(
+            gpd.GeoDataFrame(
+                pixel_data.T,
+                columns=all_column_names,
+                geometry=gpd.points_from_xy(x_coords_2, y_coords_2),
+                crs=source_raster.crs,
+            )
+        )
+        return points
+    else:
+        # Merge the coordinates and pixel data an array of N x K
+        # This has the downside of converting all the data to the same data type
+        points_arr = np.vstack((x_coords_2.reshape(1, -1), y_coords_2.reshape(1, -1), pixel_data)).T
+        return points_arr