Skip to content

Commit

Permalink
Re-structure module more logically in prepation for accessors
Browse files Browse the repository at this point in the history
  • Loading branch information
rhugonnet committed Nov 14, 2024
1 parent b98f716 commit d9a92f2
Show file tree
Hide file tree
Showing 35 changed files with 3,868 additions and 3,382 deletions.
2 changes: 1 addition & 1 deletion geoutils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from geoutils import examples, projtools, raster, vector # noqa
from geoutils._config import config # noqa
from geoutils.raster import Mask, Raster, SatelliteImage # noqa
from geoutils.vector import Vector # noqa
from geoutils.vector.vector import Vector # noqa

try:
from geoutils._version import __version__ as __version__ # noqa
Expand Down
5 changes: 5 additions & 0 deletions geoutils/interface/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from geoutils.interface.raster_point import * # noqa
from geoutils.interface.raster_vector import * # noqa
from geoutils.interface.gridding import * # noqa
from geoutils.interface.interpolate import * # noqa
from geoutils.interface.distance import * # noqa
86 changes: 86 additions & 0 deletions geoutils/interface/distance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
"""Functionalities related to distance operations."""
from __future__ import annotations

from typing import Literal
import warnings

import numpy as np
import geopandas as gpd
from scipy.ndimage import distance_transform_edt

import geoutils as gu
from geoutils._typing import NDArrayNum

def _proximity_from_vector_or_raster(
raster: gu.Raster,
vector: gu.Vector | None = None,
target_values: list[float] | None = None,
geometry_type: str = "boundary",
in_or_out: Literal["in"] | Literal["out"] | Literal["both"] = "both",
distance_unit: Literal["pixel"] | Literal["georeferenced"] = "georeferenced",
) -> NDArrayNum:
"""
(This function is defined here as mostly raster-based, but used in a class method for both Raster and Vector)
Proximity to a Raster's target values if no Vector is provided, otherwise to a Vector's geometry type
rasterized on the Raster.
:param raster: Raster to burn the proximity grid on.
:param vector: Vector for which to compute the proximity to geometry,
if not provided computed on the Raster target pixels.
:param target_values: (Only with a Raster) List of target values to use for the proximity,
defaults to all non-zero values.
:param geometry_type: (Only with a Vector) Type of geometry to use for the proximity, defaults to 'boundary'.
:param in_or_out: (Only with a Vector) Compute proximity only 'in' or 'out'-side the geometry, or 'both'.
:param distance_unit: Distance unit, either 'georeferenced' or 'pixel'.
"""

# 1/ First, if there is a vector input, we rasterize the geometry type
# (works with .boundary that is a LineString (.exterior exists, but is a LinearRing)
if vector is not None:

# TODO: Only when using centroid... Maybe we should leave this operation to the user anyway?
warnings.filterwarnings("ignore", message="Geometry is in a geographic CRS.*")

# We create a geodataframe with the geometry type
boundary_shp = gpd.GeoDataFrame(geometry=vector.ds.__getattr__(geometry_type), crs=vector.crs)
# We mask the pixels that make up the geometry type
mask_boundary = gu.Vector(boundary_shp).create_mask(raster, as_array=True)

else:
# We mask target pixels
if target_values is not None:
mask_boundary = np.logical_or.reduce([raster.get_nanarray() == target_val for target_val in target_values])
# Otherwise, all non-zero values are considered targets
else:
mask_boundary = raster.get_nanarray().astype(bool)

# 2/ Now, we compute the distance matrix relative to the masked geometry type
if distance_unit.lower() == "georeferenced":
sampling: int | tuple[float | int, float | int] = raster.res
elif distance_unit.lower() == "pixel":
sampling = 1
else:
raise ValueError('Distance unit must be either "georeferenced" or "pixel".')

# If not all pixels are targets, then we compute the distance
non_targets = np.count_nonzero(mask_boundary)
if non_targets > 0:
proximity = distance_transform_edt(~mask_boundary, sampling=sampling)
# Otherwise, pass an array full of nodata
else:
proximity = np.ones(np.shape(mask_boundary)) * np.nan

# 3/ If there was a vector input, apply the in_and_out argument to optionally mask inside/outside
if vector is not None:
if in_or_out == "both":
pass
elif in_or_out in ["in", "out"]:
mask_polygon = gu.Vector(vector.ds).create_mask(raster, as_array=True)
if in_or_out == "in":
proximity[~mask_polygon] = 0
else:
proximity[mask_polygon] = 0
else:
raise ValueError('The type of proximity must be one of "in", "out" or "both".')

return proximity
2 changes: 1 addition & 1 deletion geoutils/pointcloud.py → geoutils/interface/gridding.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Module for point cloud manipulation."""
"""Functionalities for gridding points (point cloud to raster)."""

import warnings
from typing import Literal
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
"""Functionalities for interpolating a regular grid at points (raster to point cloud)."""
from __future__ import annotations

from typing import Any, Callable, Literal, overload
Expand Down
238 changes: 238 additions & 0 deletions geoutils/interface/raster_point.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
"""Functionalities at the interface of rasters and point clouds."""
from __future__ import annotations

from typing import Literal, Iterable

import numpy as np
import geopandas as gpd
import rasterio as rio

import geoutils as gu
from geoutils._typing import NDArrayNum
from geoutils.raster.georeferencing import _default_nodata, _xy2ij
from geoutils.raster.array import _get_mask_from_array
from geoutils.raster.sampling import subsample_array

def _regular_pointcloud_to_raster(
pointcloud: gpd.GeoDataFrame,
grid_coords: tuple[NDArrayNum, NDArrayNum] = None,
transform: rio.transform.Affine = None,
shape: tuple[int, int] = None,
nodata: int | float | None = None,
data_column_name: str = "b1",
area_or_point: Literal["Area", "Point"] = "Point",
):
"""
Convert a regular point cloud to a raster. See Raster.from_pointcloud_regular() for details.
"""

# Get transform and shape from input
if grid_coords is not None:

# Input checks
if (
not isinstance(grid_coords, tuple)
or not (isinstance(grid_coords[0], np.ndarray) and grid_coords[0].ndim == 1)
or not (isinstance(grid_coords[1], np.ndarray) and grid_coords[1].ndim == 1)
):
raise TypeError("Input grid coordinates must be 1D arrays.")

diff_x = np.diff(grid_coords[0])
diff_y = np.diff(grid_coords[1])

if not all(diff_x == diff_x[0]) and all(diff_y == diff_y[0]):
raise ValueError("Grid coordinates must be regular (equally spaced, independently along X and Y).")

# Build transform from min X, max Y and step in both
out_transform = rio.transform.from_origin(
np.min(grid_coords[0]), np.max(grid_coords[1]), diff_x[0], diff_y[0]
)
# Y is first axis, X is second axis
out_shape = (len(grid_coords[1]), len(grid_coords[0]))

elif transform is not None and shape is not None:

out_transform = transform
out_shape = shape

else:
raise ValueError("Either grid coordinates or both geotransform and shape must be provided.")

# Create raster from inputs, with placeholder data for now
dtype = pointcloud[data_column_name].dtype
out_nodata = nodata if not None else _default_nodata(dtype)
arr = np.ones(out_shape, dtype=dtype)

# Get indexes of point cloud coordinates in the raster, forcing no shift
i, j = _xy2ij(x=pointcloud.geometry.x.values, y=pointcloud.geometry.y.values, shift_area_or_point=False,
transform=out_transform, area_or_point=area_or_point)

# If coordinates are not integer type (forced in xy2ij), then some points are not falling on exact coordinates
if not np.issubdtype(i.dtype, np.integer) or not np.issubdtype(i.dtype, np.integer):
raise ValueError("Some point cloud coordinates differ from the grid coordinates.")

# Set values
mask = np.ones(np.shape(arr), dtype=bool)
mask[i, j] = False
arr[i, j] = pointcloud[data_column_name].values

# Set output values
raster_arr = np.ma.masked_array(data=arr, mask=mask)

return raster_arr, out_transform, pointcloud.crs, out_nodata, area_or_point

def _raster_to_pointcloud(
source_raster: gu.Raster,
data_column_name: str,
data_band: int,
auxiliary_data_bands: list[int] | None,
auxiliary_column_names: list[str] | None,
subsample: float | int,
skip_nodata: bool,
as_array: bool,
random_state: int | np.random.Generator | None,
force_pixel_offset: Literal["center", "ul", "ur", "ll", "lr"],
) -> NDArrayNum | gu.Vector:
"""
Convert a raster to a point cloud. See Raster.to_pointcloud() for details.
"""

# Input checks

# Main data column checks
if not isinstance(data_column_name, str):
raise ValueError("Data column name must be a string.")
if not (isinstance(data_band, int) and data_band >= 1 and data_band <= source_raster.count):
raise ValueError(
f"Data band number must be an integer between 1 and the total number of bands ({source_raster.count})."
)

# Rename data column if a different band is selected but the name is still default
if data_band != 1 and data_column_name == "b1":
data_column_name = "b" + str(data_band)

# Auxiliary data columns checks
if auxiliary_column_names is not None and auxiliary_data_bands is None:
raise ValueError("Passing auxiliary column names requires passing auxiliary data band numbers as well.")
if auxiliary_data_bands is not None:
if not (
isinstance(auxiliary_data_bands, Iterable) and all(isinstance(b, int) for b in auxiliary_data_bands)
):
raise ValueError("Auxiliary data band number must be an iterable containing only integers.")
if any((1 > b or source_raster.count < b) for b in auxiliary_data_bands):
raise ValueError(
f"Auxiliary data band numbers must be between 1 and the total number of bands ({source_raster.count})."
)
if data_band in auxiliary_data_bands:
raise ValueError(
f"Main data band {data_band} should not be listed in auxiliary data bands {auxiliary_data_bands}."
)

# Ensure auxiliary column name is defined if auxiliary data bands is not None
if auxiliary_column_names is not None:
if not (
isinstance(auxiliary_column_names, Iterable)
and all(isinstance(b, str) for b in auxiliary_column_names)
):
raise ValueError("Auxiliary column names must be an iterable containing only strings.")
if not len(auxiliary_column_names) == len(auxiliary_data_bands):
raise ValueError(
f"Length of auxiliary column name and data band numbers should be the same, "
f"found {len(auxiliary_column_names)} and {len(auxiliary_data_bands)} respectively."
)

else:
auxiliary_column_names = [f"b{i}" for i in auxiliary_data_bands]

# Define bigger list with all bands and names
all_bands = [data_band] + auxiliary_data_bands
all_column_names = [data_column_name] + auxiliary_column_names

else:
all_bands = [data_band]
all_column_names = [data_column_name]

# If subsample is the entire array, load it to optimize speed
if subsample == 1 and not source_raster.is_loaded:
source_raster.load(bands=all_bands)

# Band indexes in the array are band number minus one
all_indexes = [b - 1 for b in all_bands]

# We do 2D subsampling on the data band only, regardless of valid masks on other bands
if skip_nodata:
if source_raster.is_loaded:
if source_raster.count == 1:
self_mask = _get_mask_from_array(
source_raster.data
) # This is to avoid the case where the mask is just "False"
else:
self_mask = _get_mask_from_array(
source_raster.data[data_band - 1, :, :]
) # This is to avoid the case where the mask is just "False"
valid_mask = ~self_mask

# Load only mask of valid data from disk if array not loaded
else:
valid_mask = ~source_raster._load_only_mask(bands=data_band)
# If we are not skipping nodata values, valid mask is everywhere
else:
if source_raster.count == 1:
valid_mask = np.ones(source_raster.data.shape, dtype=bool)
else:
valid_mask = np.ones(source_raster.data[0, :].shape, dtype=bool)

# Get subsample on valid mask
# Build a low memory boolean masked array with invalid values masked to pass to subsampling
ma_valid = np.ma.masked_array(data=np.ones(np.shape(valid_mask), dtype=bool), mask=~valid_mask)
# Take a subsample within the valid values
indices = subsample_array(array=ma_valid, subsample=subsample, random_state=random_state, return_indices=True)

# If the Raster is loaded, pick from the data while ignoring the mask
if source_raster.is_loaded:
if source_raster.count == 1:
pixel_data = source_raster.data[indices[0], indices[1]]
else:
# TODO: Combining both indexes at once could reduce memory usage?
pixel_data = source_raster.data[all_indexes, :][:, indices[0], indices[1]]

# Otherwise use rasterio.sample to load only requested pixels
else:
# Extract the coordinates at subsampled pixels with valid data
# To extract data, we always use "upper left" which rasterio interprets as the exact raster coordinates
# Further below we redefine output coordinates based on point interpretation
x_coords, y_coords = (np.array(a) for a in source_raster.ij2xy(indices[0], indices[1], force_offset="ul"))

with rio.open(source_raster.filename) as raster:
# Rasterio uses indexes (starts at 1)
pixel_data = np.array(list(raster.sample(zip(x_coords, y_coords), indexes=all_bands))).T

# At this point there should not be any nodata anymore, so we can transform everything to normal array
if np.ma.isMaskedArray(pixel_data):
pixel_data = pixel_data.data

# If nodata values were not skipped, convert them to NaNs and change data type
if skip_nodata is False:
pixel_data = pixel_data.astype("float32")
pixel_data[pixel_data == source_raster.nodata] = np.nan

# Now we force the coordinates we define for the point cloud, according to pixel interpretation
x_coords_2, y_coords_2 = (
np.array(a) for a in source_raster.ij2xy(indices[0], indices[1], force_offset=force_pixel_offset)
)

if not as_array:
points = gu.Vector(
gpd.GeoDataFrame(
pixel_data.T,
columns=all_column_names,
geometry=gpd.points_from_xy(x_coords_2, y_coords_2),
crs=source_raster.crs,
)
)
return points
else:
# Merge the coordinates and pixel data an array of N x K
# This has the downside of converting all the data to the same data type
points_arr = np.vstack((x_coords_2.reshape(1, -1), y_coords_2.reshape(1, -1), pixel_data)).T
return points_arr
Loading

0 comments on commit d9a92f2

Please sign in to comment.