diff --git a/doc/source/api.md b/doc/source/api.md index 10eb044c..face8593 100644 --- a/doc/source/api.md +++ b/doc/source/api.md @@ -107,6 +107,15 @@ documentation. Raster.plot ``` +### Get statistics + +```{eval-rst} +.. autosummary:: + :toctree: gen_modules/ + + Raster.get_stats +``` + ### Get or update data methods ```{eval-rst} diff --git a/doc/source/raster_class.md b/doc/source/raster_class.md index ec3d2316..ef0d0e28 100644 --- a/doc/source/raster_class.md +++ b/doc/source/raster_class.md @@ -260,14 +260,14 @@ The {func}`~geoutils.Raster.icrop` function accepts only a bounding box in pixel By default, {func}`~geoutils.Raster.crop` and {func}`~geoutils.Raster.icrop` return a new Raster unless the inplace parameter is set to True, in which case the cropping operation is performed directly on the original raster object. For more details, see the {ref}`specific section and function descriptions in the API`. -### Example for `crop` +### Example for {func}`~geoutils.Raster.crop` ```{code-cell} ipython3 # Crop raster to smaller bounds rast_crop = rast.crop(bbox=(0.3, 0.3, 1, 1)) print(rast_crop.bounds) ``` -### Example for `icrop` +### Example for {func}`~geoutils.Raster.icrop` ```{code-cell} ipython3 # Crop raster using pixel coordinates rast_icrop = rast.icrop(bbox=(2, 2, 6, 6)) @@ -360,8 +360,31 @@ rast_reproj.to_xarray() ``` ## Obtain Statistics -The `get_stats()` method allows to extract key statistical information from a raster in a dictionary. -Supported statistics are : mean, median, max, mean, sum, sum of squares, 90th percentile, nmad, rmse, std. +The {func}`~geoutils.Raster.get_stats` method allows to extract key statistical information from a raster in a dictionary. +Supported statistics are : +- **Mean:** arithmetic mean of the data, ignoring masked values. +- **Median:** middle value when the valid data points are sorted in increasing order, ignoring masked values. +- **Max:** maximum value among the data, ignoring masked values. +- **Min:** minimum value among the data, ignoring masked values. +- **Sum:** sum of all data, ignoring masked values. +- **Sum of squares:** sum of the squares of all data, ignoring masked values. +- **90th percentile:** point below which 90% of the data falls, ignoring masked values. +- **LE90 (Linear Error with 90% confidence):** Difference between the 95th and 5th percentiles of a dataset, representing the range within which 90% of the data points lie. Ignore masked values. +- **NMAD (Normalized Median Absolute Deviation):** robust measure of variability in the data, less sensitive to outliers compared to standard deviation. Ignore masked values. +- **RMSE (Root Mean Square Error):** commonly used to express the magnitude of errors or variability and can give insight into the spread of the data. Only relevant when the raster represents a difference of two objects. Ignore masked values. +- **Std (Standard deviation):** measures the spread or dispersion of the data around the mean, ignoring masked values. +- **Valid count:** number of finite data points in the array. It counts the non-masked elements. +- **Total count:** total size of the raster. +- **Percentage valid points:** ratio between **Valid count** and **Total count**. + + +If an inlier mask is passed: +- **Total inlier count:** number of data points in the inlier mask. +- **Valid inlier count:** number of unmasked data points in the array after applying the inlier mask. +- **Percentage inlier points:** ratio between **Valid inlier count** and **Valid count**. Useful for classification statistics. +- **Percentage valid inlier points:** ratio between **Valid inlier count** and **Total inlier count**. + + Callable functions are supported as well. ### Usage Examples: @@ -377,7 +400,7 @@ rast.get_stats("mean") - Get multiple statistics in a dict: ```{code-cell} ipython3 -rast.get_stats(["mean", "max", "rmse"]) +rast.get_stats(["mean", "max", "std"]) ``` - Using a custom callable statistic: @@ -386,3 +409,11 @@ def custom_stat(data): return np.nansum(data > 100) # Count the number of pixels above 100 rast.get_stats(custom_stat) ``` + +- Passing an inlier mask: +```{code-cell} ipython3 +inlier_mask = np.array([[False, False , True], + [False, True , False], + [True, False , True]]) +rast.get_stats(inlier_mask=inlier_mask) +``` diff --git a/geoutils/raster/raster.py b/geoutils/raster/raster.py index a6ecde43..2746d673 100644 --- a/geoutils/raster/raster.py +++ b/geoutils/raster/raster.py @@ -89,7 +89,7 @@ decode_sensor_metadata, parse_and_convert_metadata_from_filename, ) -from geoutils.stats import nmad +from geoutils.stats import linear_error, nmad from geoutils.vector.vector import Vector # If python38 or above, Literal is builtin. Otherwise, use typing_extensions @@ -1890,102 +1890,114 @@ def set_mask(self, mask: NDArrayBool | Mask) -> None: else: self.data[mask_arr > 0] = np.ma.masked - def _statistics(self, band: int = 1) -> dict[str, np.floating[Any]]: + def _statistics(self, band: int = 1, counts: tuple[int, int] | None = None) -> dict[str, np.floating[Any]]: """ Calculate common statistics for a specified band in the raster. :param band: The index of the band for which to compute statistics. Default is 1. + :param counts: (number of finite data points in the array, number of valid points in inlier_mask). - :returns: A dictionary containing the calculated statistics for the selected band, including mean, median, max, - min, sum, sum of squares, 90th percentile, NMAD, RMSE, and standard deviation. + :returns: A dictionary containing the calculated statistics for the selected band. """ + if self.count == 1: data = self.data else: data = self.data[band - 1] - # If data is a MaskedArray, use the compressed version (without masked values) - if isinstance(data, np.ma.MaskedArray): - data = data.compressed() - # Compute the statistics + mdata = np.ma.filled(data.astype(float), np.nan) + valid_count = np.count_nonzero(~self.get_mask()) if counts is None else counts[0] stats_dict = { - "Mean": np.nanmean(data), - "Median": np.nanmedian(data), - "Max": np.nanmax(data), - "Min": np.nanmin(data), - "Sum": np.nansum(data), - "Sum of squares": np.nansum(np.square(data)), - "90th percentile": np.nanpercentile(data, 90), + "Mean": np.ma.mean(data), + "Median": np.ma.median(data), + "Max": np.ma.max(data), + "Min": np.ma.min(data), + "Sum": np.ma.sum(data), + "Sum of squares": np.ma.sum(np.square(data)), + "90th percentile": np.nanpercentile(mdata, 90), + "LE90": linear_error(mdata, interval=90), "NMAD": nmad(data), - "RMSE": np.sqrt(np.nanmean(np.square(data - np.nanmean(data)))), - "Standard deviation": np.nanstd(data), + "RMSE": np.sqrt(np.ma.mean(np.square(data))), + "Standard deviation": np.ma.std(data), + "Valid count": valid_count, + "Total count": data.size, + "Percentage valid points": (valid_count / data.size) * 100, } + + if counts is not None: + valid_inlier_count = np.count_nonzero(~self.get_mask()) + stats_dict.update( + { + "Valid inlier count": valid_inlier_count, + "Total inlier count": counts[1], + "Percentage inlier points": (valid_inlier_count / counts[0]) * 100, + "Percentage valid inlier points": (valid_inlier_count / counts[1]) * 100, + } + ) + return stats_dict @overload def get_stats( self, - stats_name: ( - Literal["mean", "median", "max", "min", "sum", "sum of squares", "90th percentile", "nmad", "rmse", "std"] - | Callable[[NDArrayNum], np.floating[Any]] - ), + stats_name: str | Callable[[NDArrayNum], np.floating[Any]], + inlier_mask: Mask | NDArrayBool | None = None, band: int = 1, + counts: tuple[int, int] | None = None, ) -> np.floating[Any]: ... @overload def get_stats( self, - stats_name: ( - list[ - Literal[ - "mean", "median", "max", "min", "sum", "sum of squares", "90th percentile", "nmad", "rmse", "std" - ] - | Callable[[NDArrayNum], np.floating[Any]] - ] - | None - ) = None, + stats_name: list[str | Callable[[NDArrayNum], np.floating[Any]]] | None = None, + inlier_mask: Mask | NDArrayBool | None = None, band: int = 1, + counts: tuple[int, int] | None = None, ) -> dict[str, np.floating[Any]]: ... def get_stats( self, stats_name: ( - Literal["mean", "median", "max", "min", "sum", "sum of squares", "90th percentile", "nmad", "rmse", "std"] - | Callable[[NDArrayNum], np.floating[Any]] - | list[ - Literal[ - "mean", "median", "max", "min", "sum", "sum of squares", "90th percentile", "nmad", "rmse", "std" - ] - | Callable[[NDArrayNum], np.floating[Any]] - ] - | None + str | Callable[[NDArrayNum], np.floating[Any]] | list[str | Callable[[NDArrayNum], np.floating[Any]]] | None ) = None, + inlier_mask: Mask | NDArrayBool | None = None, band: int = 1, + counts: tuple[int, int] | None = None, ) -> np.floating[Any] | dict[str, np.floating[Any]]: """ Retrieve specified statistics or all available statistics for the raster data. Allows passing custom callables to calculate custom stats. :param stats_name: Name or list of names of the statistics to retrieve. If None, all statistics are returned. - Accepted names include: - - "mean", "median", "max", "min", "sum", "sum of squares", "90th percentile", "nmad", "rmse", "std" - You can also use common aliases for these names (e.g., "average", "maximum", "minimum", etc.). - Custom callables can also be provided. + Accepted names include: + `mean`, `median`, `max`, `min`, `sum`, `sum of squares`, `90th percentile`, `LE90`, `nmad`, `rmse`, + `std`, `valid count`, `total count`, `percentage valid points` and if an inlier mask is passed : + `valid inlier count`, `total inlier count`, `percentage inlier point`, `percentage valid inlier points`. + Custom callables can also be provided. + :param inlier_mask: A boolean mask to filter values for statistical calculations. :param band: The index of the band for which to compute statistics. Default is 1. - + :param counts: (number of finite data points in the array, number of valid points in inlier_mask). DO NOT USE. :returns: The requested statistic or a dictionary of statistics if multiple or all are requested. """ if not self.is_loaded: self.load() - stats_dict = self._statistics(band=band) + if inlier_mask is not None: + valid_points = np.count_nonzero(~self.get_mask()) + if isinstance(inlier_mask, Mask): + inlier_points = np.count_nonzero(~inlier_mask.data) + else: + inlier_points = np.count_nonzero(~inlier_mask) + dem_masked = self.copy() + dem_masked.set_mask(inlier_mask) + return dem_masked.get_stats(stats_name=stats_name, band=band, counts=(valid_points, inlier_points)) + stats_dict = self._statistics(band=band, counts=counts) if stats_name is None: return stats_dict # Define the metric aliases and their actual names stats_aliases = { "mean": "Mean", - "average": "Mean", "median": "Median", "max": "Max", "maximum": "Max", @@ -1994,17 +2006,28 @@ def get_stats( "sum": "Sum", "sumofsquares": "Sum of squares", "sum2": "Sum of squares", - "percentile": "90th percentile", "90thpercentile": "90th percentile", "90percentile": "90th percentile", - "percentile90": "90th percentile", + "le90": "LE90", "nmad": "NMAD", "rmse": "RMSE", + "rms": "RMSE", "std": "Standard deviation", - "stddev": "Standard deviation", - "standarddev": "Standard deviation", "standarddeviation": "Standard deviation", + "validcount": "Valid count", + "totalcount": "Total count", + "percentagevalidpoints": "Percentage valid points", } + if counts is not None: + stats_aliases.update( + { + "validinliercount": "Valid inlier count", + "totalinliercount": "Total inlier count", + "percentagevalidinlierpoints": "Percentage valid inlier points", + "percentageinlierpoints": "Percentage inlier points", + } + ) + if isinstance(stats_name, list): result = {} for name in stats_name: diff --git a/geoutils/stats.py b/geoutils/stats.py index 9ac61ca0..2d19d9b0 100644 --- a/geoutils/stats.py +++ b/geoutils/stats.py @@ -38,7 +38,31 @@ def nmad(data: NDArrayNum, nfact: float = 1.4826) -> np.floating[Any]: :returns nmad: (normalized) median absolute deviation of data. """ if isinstance(data, np.ma.masked_array): - data_arr = data.compressed() + return nfact * np.ma.median(np.abs(data - np.ma.median(data))) + return nfact * np.nanmedian(np.abs(data - np.nanmedian(data))) + + +def linear_error(data: NDArrayNum, interval: float = 90) -> np.floating[Any]: + """ + Compute the linear error (LE) for a given dataset, representing the range of differences between the upper and + lower percentiles of the data. By default, this calculates the 90% confidence interval (LE90). + + :param data: A numpy array or masked array of data, typically representing the differences (errors) in elevation or + another quantity. + :param interval: The confidence interval to compute, specified as a percentage. For example, an interval of 90 will + compute the range between the 5th and 95th percentiles (LE90). This value must be between 0 and 100. + + return: The computed linear error, which is the difference between the upper and lower percentiles. + + raises: ValueError if the `interval` is not between 0 and 100. + """ + # Validate the interval + if not (0 < interval <= 100): + raise ValueError("Interval must be between 0 and 100") + + if isinstance(data, np.ma.masked_array): + mdata = np.ma.filled(data.astype(float), np.nan) else: - data_arr = np.asarray(data) - return nfact * np.nanmedian(np.abs(data_arr - np.nanmedian(data_arr))) + mdata = data + le = np.nanpercentile(mdata, 50 + interval / 2) - np.nanpercentile(mdata, 50 - interval / 2) + return le diff --git a/tests/conftest.py b/tests/conftest.py index ec21a591..b552ee72 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,6 @@ """Configuration of pytest.""" -from pytest import DoctestItem +from _pytest.doctest import DoctestItem # To order test modules logically during execution diff --git a/tests/test_raster/test_raster.py b/tests/test_raster/test_raster.py index d464879c..84d148f6 100644 --- a/tests/test_raster/test_raster.py +++ b/tests/test_raster/test_raster.py @@ -1985,8 +1985,6 @@ def test_split_bands(self) -> None: def test_stats(self, example: str, caplog) -> None: raster = gu.Raster(example) - # Full stats - stats = raster.get_stats() expected_stats = [ "Mean", "Median", @@ -1995,18 +1993,41 @@ def test_stats(self, example: str, caplog) -> None: "Sum", "Sum of squares", "90th percentile", + "LE90", "NMAD", "RMSE", "Standard deviation", + "Valid count", + "Total count", + "Percentage valid points", ] + + # Full stats + stats = raster.get_stats() for name in expected_stats: assert name in stats assert stats.get(name) is not None + # With mask + inlier_mask = raster.get_mask() + stats_masked = raster.get_stats(inlier_mask=inlier_mask) + for name in [ + "Valid inlier count", + "Total inlier count", + "Percentage inlier points", + "Percentage valid inlier points", + ]: + assert name in stats_masked + assert stats_masked.get(name) is not None + stats_masked.pop(name) + assert stats_masked == stats + # Single stat - stat = raster.get_stats(stats_name="Average") - assert isinstance(stat, np.floating) + for name in expected_stats: + stat = raster.get_stats(stats_name=name) + assert np.isfinite(stat) + # Callable def percentile_95(data: NDArrayNum) -> np.floating[Any]: if isinstance(data, np.ma.MaskedArray): data = data.compressed() @@ -2016,8 +2037,8 @@ def percentile_95(data: NDArrayNum) -> np.floating[Any]: assert isinstance(stat, np.floating) # Selected stats and callable - stats_name = ["mean", "maximum", "std", "percentile_95"] - stats = raster.get_stats(stats_name=["mean", "maximum", "std", percentile_95]) + stats_name = ["mean", "max", "std", "percentile_95"] + stats = raster.get_stats(stats_name=["mean", "max", "std", percentile_95]) for name in stats_name: assert name in stats assert stats.get(name) is not None diff --git a/tests/test_stats.py b/tests/test_stats.py index dbfb5118..5eaa7016 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -2,10 +2,11 @@ Test functions for stats """ +import numpy as np import scipy from geoutils import Raster, examples -from geoutils.stats import nmad +from geoutils.stats import linear_error, nmad class TestStats: @@ -28,3 +29,36 @@ def test_nmad(self) -> None: nmad_2 = nmad(self.landsat_raster.data, nfact=2) assert nmad_1 * 2 == nmad_2 + + def test_linear_error(self) -> None: + """Test linear error (LE) functionality runs on any type of input""" + + # Compute LE on the landsat raster data for a default interval (LE90) + le_ma = linear_error(self.landsat_raster.data) + le_nan_array = linear_error(self.landsat_raster.get_nanarray()) + + # Assert the LE90 is computed the same for masked array and NaN array + assert le_ma == le_nan_array + + # Check that the function works for different intervals + le90 = linear_error(self.landsat_raster.data, interval=90) + le50 = linear_error(self.landsat_raster.data, interval=50) + + # Verify that LE50 (interquartile range) is smaller than LE90 + assert le50 < le90 + + # Test a known dataset + test_data = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) + le90_test = linear_error(test_data, interval=90) + le50_test = linear_error(test_data, interval=50) + + assert le90_test == 9 + assert le50_test == 5 + + # Test masked arrays with invalid data (should ignore NaNs/masked values) + masked_data = np.ma.masked_array(test_data, mask=[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) + le90_masked = linear_error(masked_data, interval=90) + le50_masked = linear_error(masked_data, interval=50) + + assert le90_masked == 4.5 + assert le50_masked == 2.5