From 071ca8e304da31a4ad376c9aee79fa771c6cd46a Mon Sep 17 00:00:00 2001 From: vschaffn Date: Fri, 24 Jan 2025 11:05:12 +0100 Subject: [PATCH 01/14] feat: add percentile valid points in --- geoutils/raster/raster.py | 3 +++ tests/test_raster/test_raster.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/geoutils/raster/raster.py b/geoutils/raster/raster.py index a6ecde43..aefe498b 100644 --- a/geoutils/raster/raster.py +++ b/geoutils/raster/raster.py @@ -1920,6 +1920,7 @@ def _statistics(self, band: int = 1) -> dict[str, np.floating[Any]]: "NMAD": nmad(data), "RMSE": np.sqrt(np.nanmean(np.square(data - np.nanmean(data)))), "Standard deviation": np.nanstd(data), + "Percentile valid points": (np.count_nonzero(~np.isnan(data)) / data.size) * 100, } return stats_dict @@ -2004,6 +2005,8 @@ def get_stats( "stddev": "Standard deviation", "standarddev": "Standard deviation", "standarddeviation": "Standard deviation", + "percentvalidpoint": "Percentile valid point", + "validpoint": "Percentile valid point", } if isinstance(stats_name, list): result = {} diff --git a/tests/test_raster/test_raster.py b/tests/test_raster/test_raster.py index d464879c..a4a54c6e 100644 --- a/tests/test_raster/test_raster.py +++ b/tests/test_raster/test_raster.py @@ -1998,13 +1998,14 @@ def test_stats(self, example: str, caplog) -> None: "NMAD", "RMSE", "Standard deviation", + "Percentile valid points", ] for name in expected_stats: assert name in stats assert stats.get(name) is not None # Single stat - stat = raster.get_stats(stats_name="Average") + stat = raster.get_stats(stats_name="average") assert isinstance(stat, np.floating) def percentile_95(data: NDArrayNum) -> np.floating[Any]: From 6344f22a62b4336b64c85d2b6c1de61f1f5df11e Mon Sep 17 00:00:00 2001 From: vschaffn Date: Fri, 24 Jan 2025 16:35:43 +0100 Subject: [PATCH 02/14] fix: remove rmse from and update docstrings --- geoutils/raster/raster.py | 55 +++++++++++++++++++++++++++----- tests/test_raster/test_raster.py | 1 - 2 files changed, 47 insertions(+), 9 deletions(-) diff --git a/geoutils/raster/raster.py b/geoutils/raster/raster.py index aefe498b..1faa8d62 100644 --- a/geoutils/raster/raster.py +++ b/geoutils/raster/raster.py @@ -1897,7 +1897,7 @@ def _statistics(self, band: int = 1) -> dict[str, np.floating[Any]]: :param band: The index of the band for which to compute statistics. Default is 1. :returns: A dictionary containing the calculated statistics for the selected band, including mean, median, max, - min, sum, sum of squares, 90th percentile, NMAD, RMSE, and standard deviation. + min, sum, sum of squares, 90th percentile, NMAD, standard deviation, and percentile valid points. """ if self.count == 1: data = self.data @@ -1918,7 +1918,6 @@ def _statistics(self, band: int = 1) -> dict[str, np.floating[Any]]: "Sum of squares": np.nansum(np.square(data)), "90th percentile": np.nanpercentile(data, 90), "NMAD": nmad(data), - "RMSE": np.sqrt(np.nanmean(np.square(data - np.nanmean(data)))), "Standard deviation": np.nanstd(data), "Percentile valid points": (np.count_nonzero(~np.isnan(data)) / data.size) * 100, } @@ -1928,7 +1927,18 @@ def _statistics(self, band: int = 1) -> dict[str, np.floating[Any]]: def get_stats( self, stats_name: ( - Literal["mean", "median", "max", "min", "sum", "sum of squares", "90th percentile", "nmad", "rmse", "std"] + Literal[ + "mean", + "median", + "max", + "min", + "sum", + "sum of squares", + "90th percentile", + "nmad", + "std", + "percentile valid points", + ] | Callable[[NDArrayNum], np.floating[Any]] ), band: int = 1, @@ -1940,7 +1950,16 @@ def get_stats( stats_name: ( list[ Literal[ - "mean", "median", "max", "min", "sum", "sum of squares", "90th percentile", "nmad", "rmse", "std" + "mean", + "median", + "max", + "min", + "sum", + "sum of squares", + "90th percentile", + "nmad", + "std", + "percentile valid points", ] | Callable[[NDArrayNum], np.floating[Any]] ] @@ -1952,11 +1971,31 @@ def get_stats( def get_stats( self, stats_name: ( - Literal["mean", "median", "max", "min", "sum", "sum of squares", "90th percentile", "nmad", "rmse", "std"] + Literal[ + "mean", + "median", + "max", + "min", + "sum", + "sum of squares", + "90th percentile", + "nmad", + "std", + "percentile valid points", + ] | Callable[[NDArrayNum], np.floating[Any]] | list[ Literal[ - "mean", "median", "max", "min", "sum", "sum of squares", "90th percentile", "nmad", "rmse", "std" + "mean", + "median", + "max", + "min", + "sum", + "sum of squares", + "90th percentile", + "nmad", + "std", + "percentile valid points", ] | Callable[[NDArrayNum], np.floating[Any]] ] @@ -1970,7 +2009,8 @@ def get_stats( :param stats_name: Name or list of names of the statistics to retrieve. If None, all statistics are returned. Accepted names include: - - "mean", "median", "max", "min", "sum", "sum of squares", "90th percentile", "nmad", "rmse", "std" + - "mean", "median", "max", "min", "sum", "sum of squares", "90th percentile", "nmad", "std", + "percentile valid points". You can also use common aliases for these names (e.g., "average", "maximum", "minimum", etc.). Custom callables can also be provided. :param band: The index of the band for which to compute statistics. Default is 1. @@ -2000,7 +2040,6 @@ def get_stats( "90percentile": "90th percentile", "percentile90": "90th percentile", "nmad": "NMAD", - "rmse": "RMSE", "std": "Standard deviation", "stddev": "Standard deviation", "standarddev": "Standard deviation", diff --git a/tests/test_raster/test_raster.py b/tests/test_raster/test_raster.py index a4a54c6e..068257f3 100644 --- a/tests/test_raster/test_raster.py +++ b/tests/test_raster/test_raster.py @@ -1996,7 +1996,6 @@ def test_stats(self, example: str, caplog) -> None: "Sum of squares", "90th percentile", "NMAD", - "RMSE", "Standard deviation", "Percentile valid points", ] From 70f4a1ea57015302b90d515ded3ab0c110ef89c7 Mon Sep 17 00:00:00 2001 From: vschaffn Date: Tue, 4 Feb 2025 11:42:55 +0100 Subject: [PATCH 03/14] feat: add in stats, typing --- doc/source/raster_class.md | 5 +++-- geoutils/raster/raster.py | 33 ++++++++++++++++---------------- tests/test_raster/test_raster.py | 9 +++++---- 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/doc/source/raster_class.md b/doc/source/raster_class.md index ec3d2316..59c1b8e1 100644 --- a/doc/source/raster_class.md +++ b/doc/source/raster_class.md @@ -361,7 +361,8 @@ rast_reproj.to_xarray() ## Obtain Statistics The `get_stats()` method allows to extract key statistical information from a raster in a dictionary. -Supported statistics are : mean, median, max, mean, sum, sum of squares, 90th percentile, nmad, rmse, std. +Supported statistics are : mean, median, max, mean, sum, sum of squares, 90th percentile, nmad, std, count and +percentage valid points. Callable functions are supported as well. ### Usage Examples: @@ -377,7 +378,7 @@ rast.get_stats("mean") - Get multiple statistics in a dict: ```{code-cell} ipython3 -rast.get_stats(["mean", "max", "rmse"]) +rast.get_stats(["mean", "max", "std"]) ``` - Using a custom callable statistic: diff --git a/geoutils/raster/raster.py b/geoutils/raster/raster.py index 1faa8d62..9542bcbe 100644 --- a/geoutils/raster/raster.py +++ b/geoutils/raster/raster.py @@ -1897,7 +1897,7 @@ def _statistics(self, band: int = 1) -> dict[str, np.floating[Any]]: :param band: The index of the band for which to compute statistics. Default is 1. :returns: A dictionary containing the calculated statistics for the selected band, including mean, median, max, - min, sum, sum of squares, 90th percentile, NMAD, standard deviation, and percentile valid points. + min, sum, sum of squares, 90th percentile, NMAD, standard deviation, count, and percentage valid points. """ if self.count == 1: data = self.data @@ -1909,6 +1909,7 @@ def _statistics(self, band: int = 1) -> dict[str, np.floating[Any]]: data = data.compressed() # Compute the statistics + count = np.count_nonzero(np.isfinite(data)) stats_dict = { "Mean": np.nanmean(data), "Median": np.nanmedian(data), @@ -1919,7 +1920,8 @@ def _statistics(self, band: int = 1) -> dict[str, np.floating[Any]]: "90th percentile": np.nanpercentile(data, 90), "NMAD": nmad(data), "Standard deviation": np.nanstd(data), - "Percentile valid points": (np.count_nonzero(~np.isnan(data)) / data.size) * 100, + "Valid count": count, + "Percentage valid points": (count / data.size) * 100, } return stats_dict @@ -1937,7 +1939,8 @@ def get_stats( "90th percentile", "nmad", "std", - "percentile valid points", + "count", + "percentage valid points", ] | Callable[[NDArrayNum], np.floating[Any]] ), @@ -1959,7 +1962,8 @@ def get_stats( "90th percentile", "nmad", "std", - "percentile valid points", + "count", + "percentage valid points", ] | Callable[[NDArrayNum], np.floating[Any]] ] @@ -1981,7 +1985,8 @@ def get_stats( "90th percentile", "nmad", "std", - "percentile valid points", + "count", + "percentage valid points", ] | Callable[[NDArrayNum], np.floating[Any]] | list[ @@ -1995,7 +2000,8 @@ def get_stats( "90th percentile", "nmad", "std", - "percentile valid points", + "count", + "percentage valid points", ] | Callable[[NDArrayNum], np.floating[Any]] ] @@ -2009,8 +2015,8 @@ def get_stats( :param stats_name: Name or list of names of the statistics to retrieve. If None, all statistics are returned. Accepted names include: - - "mean", "median", "max", "min", "sum", "sum of squares", "90th percentile", "nmad", "std", - "percentile valid points". + - "mean", "median", "max", "min", "sum", "sum of squares", "90th percentile", "nmad", "std", "count", + "percentage valid points". You can also use common aliases for these names (e.g., "average", "maximum", "minimum", etc.). Custom callables can also be provided. :param band: The index of the band for which to compute statistics. Default is 1. @@ -2026,7 +2032,6 @@ def get_stats( # Define the metric aliases and their actual names stats_aliases = { "mean": "Mean", - "average": "Mean", "median": "Median", "max": "Max", "maximum": "Max", @@ -2035,17 +2040,13 @@ def get_stats( "sum": "Sum", "sumofsquares": "Sum of squares", "sum2": "Sum of squares", - "percentile": "90th percentile", "90thpercentile": "90th percentile", "90percentile": "90th percentile", - "percentile90": "90th percentile", "nmad": "NMAD", "std": "Standard deviation", - "stddev": "Standard deviation", - "standarddev": "Standard deviation", - "standarddeviation": "Standard deviation", - "percentvalidpoint": "Percentile valid point", - "validpoint": "Percentile valid point", + "count": "Valid count", + "percentagevalidpoint": "Percentage valid point", + "validpoint": "Percentage valid point", } if isinstance(stats_name, list): result = {} diff --git a/tests/test_raster/test_raster.py b/tests/test_raster/test_raster.py index 068257f3..47e6ca6b 100644 --- a/tests/test_raster/test_raster.py +++ b/tests/test_raster/test_raster.py @@ -1997,14 +1997,15 @@ def test_stats(self, example: str, caplog) -> None: "90th percentile", "NMAD", "Standard deviation", - "Percentile valid points", + "Count", + "Percentage valid points", ] for name in expected_stats: assert name in stats assert stats.get(name) is not None # Single stat - stat = raster.get_stats(stats_name="average") + stat = raster.get_stats(stats_name="mean") assert isinstance(stat, np.floating) def percentile_95(data: NDArrayNum) -> np.floating[Any]: @@ -2016,8 +2017,8 @@ def percentile_95(data: NDArrayNum) -> np.floating[Any]: assert isinstance(stat, np.floating) # Selected stats and callable - stats_name = ["mean", "maximum", "std", "percentile_95"] - stats = raster.get_stats(stats_name=["mean", "maximum", "std", percentile_95]) + stats_name = ["mean", "max", "std", "percentile_95"] + stats = raster.get_stats(stats_name=["mean", "max", "std", percentile_95]) for name in stats_name: assert name in stats assert stats.get(name) is not None From 517f56b5af873eae18cbc88cdddfe5390398751b Mon Sep 17 00:00:00 2001 From: vschaffn Date: Tue, 4 Feb 2025 16:51:01 +0100 Subject: [PATCH 04/14] feat: add counts whithout mask in stats --- doc/source/raster_class.md | 2 +- geoutils/raster/raster.py | 35 +++++++++++++++++--------------- tests/test_raster/test_raster.py | 4 +++- 3 files changed, 23 insertions(+), 18 deletions(-) diff --git a/doc/source/raster_class.md b/doc/source/raster_class.md index 59c1b8e1..39a693a3 100644 --- a/doc/source/raster_class.md +++ b/doc/source/raster_class.md @@ -361,7 +361,7 @@ rast_reproj.to_xarray() ## Obtain Statistics The `get_stats()` method allows to extract key statistical information from a raster in a dictionary. -Supported statistics are : mean, median, max, mean, sum, sum of squares, 90th percentile, nmad, std, count and +Supported statistics are : mean, median, max, mean, sum, sum of squares, 90th percentile, nmad, std, valid points and percentage valid points. Callable functions are supported as well. diff --git a/geoutils/raster/raster.py b/geoutils/raster/raster.py index 9542bcbe..99dd95a9 100644 --- a/geoutils/raster/raster.py +++ b/geoutils/raster/raster.py @@ -1897,19 +1897,21 @@ def _statistics(self, band: int = 1) -> dict[str, np.floating[Any]]: :param band: The index of the band for which to compute statistics. Default is 1. :returns: A dictionary containing the calculated statistics for the selected band, including mean, median, max, - min, sum, sum of squares, 90th percentile, NMAD, standard deviation, count, and percentage valid points. + min, sum, sum of squares, 90th percentile, NMAD, standard deviation, valid points, and percentage valid points. """ + if self.count == 1: data = self.data else: data = self.data[band - 1] - # If data is a MaskedArray, use the compressed version (without masked values) - if isinstance(data, np.ma.MaskedArray): - data = data.compressed() + # Use the compressed version (without masked values) + valid_points_no_mask = np.count_nonzero(np.isfinite(data)) + percentage_valid_points_no_mask = valid_points_no_mask / data.size * 100 + data = data.compressed() # Compute the statistics - count = np.count_nonzero(np.isfinite(data)) + valid_points = np.count_nonzero(np.isfinite(data)) stats_dict = { "Mean": np.nanmean(data), "Median": np.nanmedian(data), @@ -1920,8 +1922,10 @@ def _statistics(self, band: int = 1) -> dict[str, np.floating[Any]]: "90th percentile": np.nanpercentile(data, 90), "NMAD": nmad(data), "Standard deviation": np.nanstd(data), - "Valid count": count, - "Percentage valid points": (count / data.size) * 100, + "Valid points": valid_points, + "Percentage valid points": (valid_points / data.size) * 100, + "Valid points no mask": valid_points_no_mask, + "Percentage valid points no mask": percentage_valid_points_no_mask, } return stats_dict @@ -1939,7 +1943,7 @@ def get_stats( "90th percentile", "nmad", "std", - "count", + "valid points", "percentage valid points", ] | Callable[[NDArrayNum], np.floating[Any]] @@ -1962,7 +1966,7 @@ def get_stats( "90th percentile", "nmad", "std", - "count", + "valid points", "percentage valid points", ] | Callable[[NDArrayNum], np.floating[Any]] @@ -1985,7 +1989,7 @@ def get_stats( "90th percentile", "nmad", "std", - "count", + "valid points", "percentage valid points", ] | Callable[[NDArrayNum], np.floating[Any]] @@ -2000,7 +2004,7 @@ def get_stats( "90th percentile", "nmad", "std", - "count", + "valid points", "percentage valid points", ] | Callable[[NDArrayNum], np.floating[Any]] @@ -2015,8 +2019,8 @@ def get_stats( :param stats_name: Name or list of names of the statistics to retrieve. If None, all statistics are returned. Accepted names include: - - "mean", "median", "max", "min", "sum", "sum of squares", "90th percentile", "nmad", "std", "count", - "percentage valid points". + - "mean", "median", "max", "min", "sum", "sum of squares", "90th percentile", "nmad", "std", + "valid points", "percentage valid points". You can also use common aliases for these names (e.g., "average", "maximum", "minimum", etc.). Custom callables can also be provided. :param band: The index of the band for which to compute statistics. Default is 1. @@ -2044,9 +2048,8 @@ def get_stats( "90percentile": "90th percentile", "nmad": "NMAD", "std": "Standard deviation", - "count": "Valid count", - "percentagevalidpoint": "Percentage valid point", - "validpoint": "Percentage valid point", + "validpoints": "Valid points", + "percentagevalidpoints": "Percentage valid point", } if isinstance(stats_name, list): result = {} diff --git a/tests/test_raster/test_raster.py b/tests/test_raster/test_raster.py index 47e6ca6b..9e50e9cb 100644 --- a/tests/test_raster/test_raster.py +++ b/tests/test_raster/test_raster.py @@ -1997,8 +1997,10 @@ def test_stats(self, example: str, caplog) -> None: "90th percentile", "NMAD", "Standard deviation", - "Count", + "Valid points", "Percentage valid points", + "Valid points no mask", + "Percentage valid points no mask", ] for name in expected_stats: assert name in stats From f0035aadbcb2ada0a50fb46b7e57eee796a4c756 Mon Sep 17 00:00:00 2001 From: vschaffn Date: Mon, 10 Feb 2025 16:53:13 +0100 Subject: [PATCH 05/14] feat: add rmse in raster stats --- doc/source/raster_class.md | 5 +++-- geoutils/raster/raster.py | 13 ++++++++++--- tests/test_raster/test_raster.py | 1 + 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/doc/source/raster_class.md b/doc/source/raster_class.md index 39a693a3..d31ef5a9 100644 --- a/doc/source/raster_class.md +++ b/doc/source/raster_class.md @@ -361,8 +361,9 @@ rast_reproj.to_xarray() ## Obtain Statistics The `get_stats()` method allows to extract key statistical information from a raster in a dictionary. -Supported statistics are : mean, median, max, mean, sum, sum of squares, 90th percentile, nmad, std, valid points and -percentage valid points. +Supported statistics are : mean, median, max, mean, sum, sum of squares, 90th percentile, nmad, rmse, std, valid points +and percentage valid points. +The RMSE is only relevant when the raster represents a difference of two objects. Callable functions are supported as well. ### Usage Examples: diff --git a/geoutils/raster/raster.py b/geoutils/raster/raster.py index 99dd95a9..e37d0d76 100644 --- a/geoutils/raster/raster.py +++ b/geoutils/raster/raster.py @@ -1897,7 +1897,8 @@ def _statistics(self, band: int = 1) -> dict[str, np.floating[Any]]: :param band: The index of the band for which to compute statistics. Default is 1. :returns: A dictionary containing the calculated statistics for the selected band, including mean, median, max, - min, sum, sum of squares, 90th percentile, NMAD, standard deviation, valid points, and percentage valid points. + min, sum, sum of squares, 90th percentile, NMAD, RMSE, standard deviation, valid points, and percentage + valid points. """ if self.count == 1: @@ -1921,6 +1922,7 @@ def _statistics(self, band: int = 1) -> dict[str, np.floating[Any]]: "Sum of squares": np.nansum(np.square(data)), "90th percentile": np.nanpercentile(data, 90), "NMAD": nmad(data), + "RMSE": np.sqrt(np.nanmean(np.square(data))), "Standard deviation": np.nanstd(data), "Valid points": valid_points, "Percentage valid points": (valid_points / data.size) * 100, @@ -1942,6 +1944,7 @@ def get_stats( "sum of squares", "90th percentile", "nmad", + "rmse", "std", "valid points", "percentage valid points", @@ -1965,6 +1968,7 @@ def get_stats( "sum of squares", "90th percentile", "nmad", + "rmse", "std", "valid points", "percentage valid points", @@ -1988,6 +1992,7 @@ def get_stats( "sum of squares", "90th percentile", "nmad", + "rmse", "std", "valid points", "percentage valid points", @@ -2003,6 +2008,7 @@ def get_stats( "sum of squares", "90th percentile", "nmad", + "rmse", "std", "valid points", "percentage valid points", @@ -2019,9 +2025,8 @@ def get_stats( :param stats_name: Name or list of names of the statistics to retrieve. If None, all statistics are returned. Accepted names include: - - "mean", "median", "max", "min", "sum", "sum of squares", "90th percentile", "nmad", "std", + - "mean", "median", "max", "min", "sum", "sum of squares", "90th percentile", "nmad", "rmse", "std", "valid points", "percentage valid points". - You can also use common aliases for these names (e.g., "average", "maximum", "minimum", etc.). Custom callables can also be provided. :param band: The index of the band for which to compute statistics. Default is 1. @@ -2047,6 +2052,8 @@ def get_stats( "90thpercentile": "90th percentile", "90percentile": "90th percentile", "nmad": "NMAD", + "rmse": "RMSE", + "rms": "RMSE", "std": "Standard deviation", "validpoints": "Valid points", "percentagevalidpoints": "Percentage valid point", diff --git a/tests/test_raster/test_raster.py b/tests/test_raster/test_raster.py index 9e50e9cb..b3c70208 100644 --- a/tests/test_raster/test_raster.py +++ b/tests/test_raster/test_raster.py @@ -1996,6 +1996,7 @@ def test_stats(self, example: str, caplog) -> None: "Sum of squares", "90th percentile", "NMAD", + "RMSE", "Standard deviation", "Valid points", "Percentage valid points", From d186d68975fc4fc22e9e31a1ada1768f8b1184f7 Mon Sep 17 00:00:00 2001 From: vschaffn Date: Mon, 10 Feb 2025 16:55:38 +0100 Subject: [PATCH 06/14] fix doctestitem --- tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index ec21a591..b552ee72 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,6 @@ """Configuration of pytest.""" -from pytest import DoctestItem +from _pytest.doctest import DoctestItem # To order test modules logically during execution From 22c981892b0f59d2516af5a6ddd39f1156c119e1 Mon Sep 17 00:00:00 2001 From: vschaffn Date: Tue, 11 Feb 2025 14:39:44 +0100 Subject: [PATCH 07/14] fix: correct valid points in stats, correct typing --- doc/source/raster_class.md | 4 +- geoutils/raster/raster.py | 117 +++++++------------------------ tests/test_raster/test_raster.py | 15 ++-- 3 files changed, 36 insertions(+), 100 deletions(-) diff --git a/doc/source/raster_class.md b/doc/source/raster_class.md index d31ef5a9..59feeb38 100644 --- a/doc/source/raster_class.md +++ b/doc/source/raster_class.md @@ -361,8 +361,8 @@ rast_reproj.to_xarray() ## Obtain Statistics The `get_stats()` method allows to extract key statistical information from a raster in a dictionary. -Supported statistics are : mean, median, max, mean, sum, sum of squares, 90th percentile, nmad, rmse, std, valid points -and percentage valid points. +Supported statistics are : mean, median, max, mean, sum, sum of squares, 90th percentile, nmad, rmse, std, valid points, +percentage valid points, valid points all data, percentage valid points all data. The RMSE is only relevant when the raster represents a difference of two objects. Callable functions are supported as well. diff --git a/geoutils/raster/raster.py b/geoutils/raster/raster.py index e37d0d76..5e72ced7 100644 --- a/geoutils/raster/raster.py +++ b/geoutils/raster/raster.py @@ -1897,8 +1897,8 @@ def _statistics(self, band: int = 1) -> dict[str, np.floating[Any]]: :param band: The index of the band for which to compute statistics. Default is 1. :returns: A dictionary containing the calculated statistics for the selected band, including mean, median, max, - min, sum, sum of squares, 90th percentile, NMAD, RMSE, standard deviation, valid points, and percentage - valid points. + min, sum, sum of squares, 90th percentile, NMAD, RMSE, standard deviation, valid points, percentage + valid points, valid points all data, percentage valid points all data. """ if self.count == 1: @@ -1906,116 +1906,46 @@ def _statistics(self, band: int = 1) -> dict[str, np.floating[Any]]: else: data = self.data[band - 1] - # Use the compressed version (without masked values) - valid_points_no_mask = np.count_nonzero(np.isfinite(data)) - percentage_valid_points_no_mask = valid_points_no_mask / data.size * 100 - data = data.compressed() - # Compute the statistics - valid_points = np.count_nonzero(np.isfinite(data)) + compressed_data = data.compressed() + valid_points = np.count_nonzero(np.isfinite(compressed_data)) + valid_points_all_data = np.count_nonzero(np.isfinite(data)) stats_dict = { - "Mean": np.nanmean(data), - "Median": np.nanmedian(data), - "Max": np.nanmax(data), - "Min": np.nanmin(data), - "Sum": np.nansum(data), - "Sum of squares": np.nansum(np.square(data)), - "90th percentile": np.nanpercentile(data, 90), - "NMAD": nmad(data), - "RMSE": np.sqrt(np.nanmean(np.square(data))), - "Standard deviation": np.nanstd(data), - "Valid points": valid_points, + "Mean": np.nanmean(compressed_data), + "Median": np.nanmedian(compressed_data), + "Max": np.nanmax(compressed_data), + "Min": np.nanmin(compressed_data), + "Sum": np.nansum(compressed_data), + "Sum of squares": np.nansum(np.square(compressed_data)), + "90th percentile": np.nanpercentile(compressed_data, 90), + "NMAD": nmad(compressed_data), + "RMSE": np.sqrt(np.nanmean(np.square(compressed_data))), + "Standard deviation": np.nanstd(compressed_data), + "Valid points": np.count_nonzero(np.isfinite(compressed_data)), "Percentage valid points": (valid_points / data.size) * 100, - "Valid points no mask": valid_points_no_mask, - "Percentage valid points no mask": percentage_valid_points_no_mask, + "Valid points all data": valid_points_all_data, + "Percentage valid points all data": (valid_points_all_data / data.size) * 100, } return stats_dict @overload def get_stats( self, - stats_name: ( - Literal[ - "mean", - "median", - "max", - "min", - "sum", - "sum of squares", - "90th percentile", - "nmad", - "rmse", - "std", - "valid points", - "percentage valid points", - ] - | Callable[[NDArrayNum], np.floating[Any]] - ), + stats_name: str | Callable[[NDArrayNum], np.floating[Any]], band: int = 1, ) -> np.floating[Any]: ... @overload def get_stats( self, - stats_name: ( - list[ - Literal[ - "mean", - "median", - "max", - "min", - "sum", - "sum of squares", - "90th percentile", - "nmad", - "rmse", - "std", - "valid points", - "percentage valid points", - ] - | Callable[[NDArrayNum], np.floating[Any]] - ] - | None - ) = None, + stats_name: list[str | Callable[[NDArrayNum], np.floating[Any]]] | None = None, band: int = 1, ) -> dict[str, np.floating[Any]]: ... def get_stats( self, stats_name: ( - Literal[ - "mean", - "median", - "max", - "min", - "sum", - "sum of squares", - "90th percentile", - "nmad", - "rmse", - "std", - "valid points", - "percentage valid points", - ] - | Callable[[NDArrayNum], np.floating[Any]] - | list[ - Literal[ - "mean", - "median", - "max", - "min", - "sum", - "sum of squares", - "90th percentile", - "nmad", - "rmse", - "std", - "valid points", - "percentage valid points", - ] - | Callable[[NDArrayNum], np.floating[Any]] - ] - | None + str | Callable[[NDArrayNum], np.floating[Any]] | list[str | Callable[[NDArrayNum], np.floating[Any]]] | None ) = None, band: int = 1, ) -> np.floating[Any] | dict[str, np.floating[Any]]: @@ -2055,8 +1985,11 @@ def get_stats( "rmse": "RMSE", "rms": "RMSE", "std": "Standard deviation", + "standarddeviation": "Standard deviation", "validpoints": "Valid points", - "percentagevalidpoints": "Percentage valid point", + "percentagevalidpoints": "Percentage valid points", + "validpointsalldata": "Valid points all data", + "percentagevalidpointsalldata": "Percentage valid points all data", } if isinstance(stats_name, list): result = {} diff --git a/tests/test_raster/test_raster.py b/tests/test_raster/test_raster.py index b3c70208..d4c7a50c 100644 --- a/tests/test_raster/test_raster.py +++ b/tests/test_raster/test_raster.py @@ -1985,8 +1985,6 @@ def test_split_bands(self) -> None: def test_stats(self, example: str, caplog) -> None: raster = gu.Raster(example) - # Full stats - stats = raster.get_stats() expected_stats = [ "Mean", "Median", @@ -2000,17 +1998,22 @@ def test_stats(self, example: str, caplog) -> None: "Standard deviation", "Valid points", "Percentage valid points", - "Valid points no mask", - "Percentage valid points no mask", + "Valid points all data", + "Percentage valid points all data", ] + + # Full stats + stats = raster.get_stats() for name in expected_stats: assert name in stats assert stats.get(name) is not None # Single stat - stat = raster.get_stats(stats_name="mean") - assert isinstance(stat, np.floating) + for name in expected_stats: + stat = raster.get_stats(stats_name=name) + assert np.isfinite(stat) + # Callable def percentile_95(data: NDArrayNum) -> np.floating[Any]: if isinstance(data, np.ma.MaskedArray): data = data.compressed() From 122fde01d8aa49e18a8599f10b2e33e539b5254b Mon Sep 17 00:00:00 2001 From: vschaffn Date: Tue, 18 Feb 2025 11:48:23 +0100 Subject: [PATCH 08/14] fix: modify the way to handle masks in stats --- geoutils/raster/raster.py | 46 ++++++++++++++++---------------- geoutils/stats.py | 6 ++--- tests/test_raster/test_raster.py | 6 ++--- 3 files changed, 28 insertions(+), 30 deletions(-) diff --git a/geoutils/raster/raster.py b/geoutils/raster/raster.py index 5e72ced7..6ae72add 100644 --- a/geoutils/raster/raster.py +++ b/geoutils/raster/raster.py @@ -1897,8 +1897,8 @@ def _statistics(self, band: int = 1) -> dict[str, np.floating[Any]]: :param band: The index of the band for which to compute statistics. Default is 1. :returns: A dictionary containing the calculated statistics for the selected band, including mean, median, max, - min, sum, sum of squares, 90th percentile, NMAD, RMSE, standard deviation, valid points, percentage - valid points, valid points all data, percentage valid points all data. + min, sum, sum of squares, 90th percentile, NMAD, RMSE, standard deviation, valid count, total count, + percentage valid points, size. """ if self.count == 1: @@ -1907,24 +1907,24 @@ def _statistics(self, band: int = 1) -> dict[str, np.floating[Any]]: data = self.data[band - 1] # Compute the statistics - compressed_data = data.compressed() - valid_points = np.count_nonzero(np.isfinite(compressed_data)) - valid_points_all_data = np.count_nonzero(np.isfinite(data)) + mdata = np.ma.filled(data.astype(float), np.nan) + valid_count = np.count_nonzero(~data.mask) + total_count = np.count_nonzero(np.isfinite(data)) stats_dict = { - "Mean": np.nanmean(compressed_data), - "Median": np.nanmedian(compressed_data), - "Max": np.nanmax(compressed_data), - "Min": np.nanmin(compressed_data), - "Sum": np.nansum(compressed_data), - "Sum of squares": np.nansum(np.square(compressed_data)), - "90th percentile": np.nanpercentile(compressed_data, 90), - "NMAD": nmad(compressed_data), - "RMSE": np.sqrt(np.nanmean(np.square(compressed_data))), - "Standard deviation": np.nanstd(compressed_data), - "Valid points": np.count_nonzero(np.isfinite(compressed_data)), - "Percentage valid points": (valid_points / data.size) * 100, - "Valid points all data": valid_points_all_data, - "Percentage valid points all data": (valid_points_all_data / data.size) * 100, + "Mean": np.ma.mean(data), + "Median": np.ma.median(data), + "Max": np.ma.max(data), + "Min": np.ma.min(data), + "Sum": np.ma.sum(data), + "Sum of squares": np.ma.sum(np.square(data)), + "90th percentile": np.nanpercentile(mdata, 90), + "NMAD": nmad(data), + "RMSE": np.sqrt(np.ma.mean(np.square(data))), + "Standard deviation": np.ma.std(data), + "Valid count": valid_count, + "Total count": total_count, + "Percentage valid points": (valid_count / total_count) * 100, + "Size": data.size, } return stats_dict @@ -1956,7 +1956,7 @@ def get_stats( :param stats_name: Name or list of names of the statistics to retrieve. If None, all statistics are returned. Accepted names include: - "mean", "median", "max", "min", "sum", "sum of squares", "90th percentile", "nmad", "rmse", "std", - "valid points", "percentage valid points". + "valid count", "total count", "percentage valid points", "size". Custom callables can also be provided. :param band: The index of the band for which to compute statistics. Default is 1. @@ -1986,10 +1986,10 @@ def get_stats( "rms": "RMSE", "std": "Standard deviation", "standarddeviation": "Standard deviation", - "validpoints": "Valid points", + "validcount": "Valid count", + "totalcount": "Total count", "percentagevalidpoints": "Percentage valid points", - "validpointsalldata": "Valid points all data", - "percentagevalidpointsalldata": "Percentage valid points all data", + "size": "Size", } if isinstance(stats_name, list): result = {} diff --git a/geoutils/stats.py b/geoutils/stats.py index 9ac61ca0..61e9a94d 100644 --- a/geoutils/stats.py +++ b/geoutils/stats.py @@ -38,7 +38,5 @@ def nmad(data: NDArrayNum, nfact: float = 1.4826) -> np.floating[Any]: :returns nmad: (normalized) median absolute deviation of data. """ if isinstance(data, np.ma.masked_array): - data_arr = data.compressed() - else: - data_arr = np.asarray(data) - return nfact * np.nanmedian(np.abs(data_arr - np.nanmedian(data_arr))) + return nfact * np.ma.median(np.abs(data - np.ma.median(data))) + return nfact * np.nanmedian(np.abs(data - np.nanmedian(data))) diff --git a/tests/test_raster/test_raster.py b/tests/test_raster/test_raster.py index d4c7a50c..e4d09aa0 100644 --- a/tests/test_raster/test_raster.py +++ b/tests/test_raster/test_raster.py @@ -1996,10 +1996,10 @@ def test_stats(self, example: str, caplog) -> None: "NMAD", "RMSE", "Standard deviation", - "Valid points", + "Valid count", + "Total count", "Percentage valid points", - "Valid points all data", - "Percentage valid points all data", + "Size", ] # Full stats From 409ef4da9003af1a6edbaba00cf5af1f524cb6a7 Mon Sep 17 00:00:00 2001 From: vschaffn Date: Tue, 18 Feb 2025 11:49:14 +0100 Subject: [PATCH 09/14] docs: complete doc about raster.get_stats --- doc/source/api.md | 9 +++++++++ doc/source/raster_class.md | 21 +++++++++++++++++---- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/doc/source/api.md b/doc/source/api.md index 10eb044c..face8593 100644 --- a/doc/source/api.md +++ b/doc/source/api.md @@ -107,6 +107,15 @@ documentation. Raster.plot ``` +### Get statistics + +```{eval-rst} +.. autosummary:: + :toctree: gen_modules/ + + Raster.get_stats +``` + ### Get or update data methods ```{eval-rst} diff --git a/doc/source/raster_class.md b/doc/source/raster_class.md index 59feeb38..b5feaaff 100644 --- a/doc/source/raster_class.md +++ b/doc/source/raster_class.md @@ -360,10 +360,23 @@ rast_reproj.to_xarray() ``` ## Obtain Statistics -The `get_stats()` method allows to extract key statistical information from a raster in a dictionary. -Supported statistics are : mean, median, max, mean, sum, sum of squares, 90th percentile, nmad, rmse, std, valid points, -percentage valid points, valid points all data, percentage valid points all data. -The RMSE is only relevant when the raster represents a difference of two objects. +The {func}`~geoutils.Raster.get_stats` method allows to extract key statistical information from a raster in a dictionary. +Supported statistics are : +- **Mean:** arithmetic mean of the data, ignoring masked values. +- **Median:** middle value when the valid data points are sorted in increasing order, ignoring masked values. +- **Max:** maximum value among the data, ignoring masked values. +- **Min:** minimum value among the data, ignoring masked values. +- **Sum:** sum of all data, ignoring masked values. +- **Sum of squares:** sum of the squares of all data, ignoring masked values. +- **90th percentile:** point below which 90% of the data falls, ignoring masked values. +- **NMAD (Normalized Median Absolute Deviation):** robust measure of variability in the data, less sensitive to outliers compared to standard deviation. Ignore masked values. +- **RMSE (Root Mean Square Error):** commonly used to express the magnitude of errors or variability and can give insight into the spread of the data. Only relevant when the raster represents a difference of two objects. Ignore masked values. +- **Std (Standard deviation):** Measures the spread or dispersion of the data around the mean, ignoring masked values. +- **Valid count:** The number of valid (unmasked) data points in the array. It counts the non-masked elements. +- **Total count:** The total number of finite data points in the array, including masked elements. +- **Percentage valid points:** The percentage of unmasked data points out of the total number of finite points. Useful for classification statistics. +- **Size:** Total size of the raster. + Callable functions are supported as well. ### Usage Examples: From b401b541f929b20accedd5a7ba58efba9ff88f21 Mon Sep 17 00:00:00 2001 From: vschaffn Date: Fri, 21 Feb 2025 10:53:50 +0100 Subject: [PATCH 10/14] feat: add LE90 in stats --- doc/source/raster_class.md | 1 + geoutils/raster/raster.py | 10 ++++++---- geoutils/stats.py | 26 ++++++++++++++++++++++++++ tests/test_stats.py | 36 +++++++++++++++++++++++++++++++++++- 4 files changed, 68 insertions(+), 5 deletions(-) diff --git a/doc/source/raster_class.md b/doc/source/raster_class.md index b5feaaff..b55f2110 100644 --- a/doc/source/raster_class.md +++ b/doc/source/raster_class.md @@ -369,6 +369,7 @@ Supported statistics are : - **Sum:** sum of all data, ignoring masked values. - **Sum of squares:** sum of the squares of all data, ignoring masked values. - **90th percentile:** point below which 90% of the data falls, ignoring masked values. +- **LE90 (Linear Error with 90% confidence):** Difference between the 95th and 5th percentiles of a dataset, representing the range within which 90% of the data points lie. Ignore masked values. - **NMAD (Normalized Median Absolute Deviation):** robust measure of variability in the data, less sensitive to outliers compared to standard deviation. Ignore masked values. - **RMSE (Root Mean Square Error):** commonly used to express the magnitude of errors or variability and can give insight into the spread of the data. Only relevant when the raster represents a difference of two objects. Ignore masked values. - **Std (Standard deviation):** Measures the spread or dispersion of the data around the mean, ignoring masked values. diff --git a/geoutils/raster/raster.py b/geoutils/raster/raster.py index 6ae72add..eff18409 100644 --- a/geoutils/raster/raster.py +++ b/geoutils/raster/raster.py @@ -89,7 +89,7 @@ decode_sensor_metadata, parse_and_convert_metadata_from_filename, ) -from geoutils.stats import nmad +from geoutils.stats import linear_error, nmad from geoutils.vector.vector import Vector # If python38 or above, Literal is builtin. Otherwise, use typing_extensions @@ -1897,7 +1897,7 @@ def _statistics(self, band: int = 1) -> dict[str, np.floating[Any]]: :param band: The index of the band for which to compute statistics. Default is 1. :returns: A dictionary containing the calculated statistics for the selected band, including mean, median, max, - min, sum, sum of squares, 90th percentile, NMAD, RMSE, standard deviation, valid count, total count, + min, sum, sum of squares, 90th percentile, LE90, NMAD, RMSE, standard deviation, valid count, total count, percentage valid points, size. """ @@ -1918,6 +1918,7 @@ def _statistics(self, band: int = 1) -> dict[str, np.floating[Any]]: "Sum": np.ma.sum(data), "Sum of squares": np.ma.sum(np.square(data)), "90th percentile": np.nanpercentile(mdata, 90), + "LE90": linear_error(mdata, interval=90), "NMAD": nmad(data), "RMSE": np.sqrt(np.ma.mean(np.square(data))), "Standard deviation": np.ma.std(data), @@ -1955,8 +1956,8 @@ def get_stats( :param stats_name: Name or list of names of the statistics to retrieve. If None, all statistics are returned. Accepted names include: - - "mean", "median", "max", "min", "sum", "sum of squares", "90th percentile", "nmad", "rmse", "std", - "valid count", "total count", "percentage valid points", "size". + - "mean", "median", "max", "min", "sum", "sum of squares", "90th percentile", "LE90", "nmad", "rmse", + "std", "valid count", "total count", "percentage valid points", "size". Custom callables can also be provided. :param band: The index of the band for which to compute statistics. Default is 1. @@ -1981,6 +1982,7 @@ def get_stats( "sum2": "Sum of squares", "90thpercentile": "90th percentile", "90percentile": "90th percentile", + "le90": "LE90", "nmad": "NMAD", "rmse": "RMSE", "rms": "RMSE", diff --git a/geoutils/stats.py b/geoutils/stats.py index 61e9a94d..2d19d9b0 100644 --- a/geoutils/stats.py +++ b/geoutils/stats.py @@ -40,3 +40,29 @@ def nmad(data: NDArrayNum, nfact: float = 1.4826) -> np.floating[Any]: if isinstance(data, np.ma.masked_array): return nfact * np.ma.median(np.abs(data - np.ma.median(data))) return nfact * np.nanmedian(np.abs(data - np.nanmedian(data))) + + +def linear_error(data: NDArrayNum, interval: float = 90) -> np.floating[Any]: + """ + Compute the linear error (LE) for a given dataset, representing the range of differences between the upper and + lower percentiles of the data. By default, this calculates the 90% confidence interval (LE90). + + :param data: A numpy array or masked array of data, typically representing the differences (errors) in elevation or + another quantity. + :param interval: The confidence interval to compute, specified as a percentage. For example, an interval of 90 will + compute the range between the 5th and 95th percentiles (LE90). This value must be between 0 and 100. + + return: The computed linear error, which is the difference between the upper and lower percentiles. + + raises: ValueError if the `interval` is not between 0 and 100. + """ + # Validate the interval + if not (0 < interval <= 100): + raise ValueError("Interval must be between 0 and 100") + + if isinstance(data, np.ma.masked_array): + mdata = np.ma.filled(data.astype(float), np.nan) + else: + mdata = data + le = np.nanpercentile(mdata, 50 + interval / 2) - np.nanpercentile(mdata, 50 - interval / 2) + return le diff --git a/tests/test_stats.py b/tests/test_stats.py index dbfb5118..5eaa7016 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -2,10 +2,11 @@ Test functions for stats """ +import numpy as np import scipy from geoutils import Raster, examples -from geoutils.stats import nmad +from geoutils.stats import linear_error, nmad class TestStats: @@ -28,3 +29,36 @@ def test_nmad(self) -> None: nmad_2 = nmad(self.landsat_raster.data, nfact=2) assert nmad_1 * 2 == nmad_2 + + def test_linear_error(self) -> None: + """Test linear error (LE) functionality runs on any type of input""" + + # Compute LE on the landsat raster data for a default interval (LE90) + le_ma = linear_error(self.landsat_raster.data) + le_nan_array = linear_error(self.landsat_raster.get_nanarray()) + + # Assert the LE90 is computed the same for masked array and NaN array + assert le_ma == le_nan_array + + # Check that the function works for different intervals + le90 = linear_error(self.landsat_raster.data, interval=90) + le50 = linear_error(self.landsat_raster.data, interval=50) + + # Verify that LE50 (interquartile range) is smaller than LE90 + assert le50 < le90 + + # Test a known dataset + test_data = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) + le90_test = linear_error(test_data, interval=90) + le50_test = linear_error(test_data, interval=50) + + assert le90_test == 9 + assert le50_test == 5 + + # Test masked arrays with invalid data (should ignore NaNs/masked values) + masked_data = np.ma.masked_array(test_data, mask=[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) + le90_masked = linear_error(masked_data, interval=90) + le50_masked = linear_error(masked_data, interval=50) + + assert le90_masked == 4.5 + assert le50_masked == 2.5 From 1041f540d643061f71d614a232e0e4526fd64004 Mon Sep 17 00:00:00 2001 From: vschaffn Date: Mon, 24 Feb 2025 14:13:13 +0100 Subject: [PATCH 11/14] docs: update --- doc/source/raster_class.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/raster_class.md b/doc/source/raster_class.md index b55f2110..61cdd366 100644 --- a/doc/source/raster_class.md +++ b/doc/source/raster_class.md @@ -260,14 +260,14 @@ The {func}`~geoutils.Raster.icrop` function accepts only a bounding box in pixel By default, {func}`~geoutils.Raster.crop` and {func}`~geoutils.Raster.icrop` return a new Raster unless the inplace parameter is set to True, in which case the cropping operation is performed directly on the original raster object. For more details, see the {ref}`specific section and function descriptions in the API`. -### Example for `crop` +### Example for {func}`~geoutils.Raster.crop` ```{code-cell} ipython3 # Crop raster to smaller bounds rast_crop = rast.crop(bbox=(0.3, 0.3, 1, 1)) print(rast_crop.bounds) ``` -### Example for `icrop` +### Example for {func}`~geoutils.Raster.icrop` ```{code-cell} ipython3 # Crop raster using pixel coordinates rast_icrop = rast.icrop(bbox=(2, 2, 6, 6)) From 8192338b164e35f5e46c073423847a7f4f98ba7f Mon Sep 17 00:00:00 2001 From: vschaffn Date: Mon, 24 Feb 2025 15:03:45 +0100 Subject: [PATCH 12/14] feat: add mask parameter in get_stats() --- geoutils/raster/raster.py | 23 ++++++++++++++++++----- tests/test_raster/test_raster.py | 6 ++++++ 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/geoutils/raster/raster.py b/geoutils/raster/raster.py index eff18409..410331a3 100644 --- a/geoutils/raster/raster.py +++ b/geoutils/raster/raster.py @@ -1890,7 +1890,7 @@ def set_mask(self, mask: NDArrayBool | Mask) -> None: else: self.data[mask_arr > 0] = np.ma.masked - def _statistics(self, band: int = 1) -> dict[str, np.floating[Any]]: + def _statistics(self, band: int = 1, total_count: int | None = None) -> dict[str, np.floating[Any]]: """ Calculate common statistics for a specified band in the raster. @@ -1908,8 +1908,9 @@ def _statistics(self, band: int = 1) -> dict[str, np.floating[Any]]: # Compute the statistics mdata = np.ma.filled(data.astype(float), np.nan) - valid_count = np.count_nonzero(~data.mask) - total_count = np.count_nonzero(np.isfinite(data)) + valid_count = np.count_nonzero(~self.get_mask()) + if total_count is None: + total_count = valid_count stats_dict = { "Mean": np.ma.mean(data), "Median": np.ma.median(data), @@ -1933,14 +1934,18 @@ def _statistics(self, band: int = 1) -> dict[str, np.floating[Any]]: def get_stats( self, stats_name: str | Callable[[NDArrayNum], np.floating[Any]], + inlier_mask: Mask | NDArrayBool | None = None, band: int = 1, + total_count: int | None = None, ) -> np.floating[Any]: ... @overload def get_stats( self, stats_name: list[str | Callable[[NDArrayNum], np.floating[Any]]] | None = None, + inlier_mask: Mask | NDArrayBool | None = None, band: int = 1, + total_count: int | None = None, ) -> dict[str, np.floating[Any]]: ... def get_stats( @@ -1948,7 +1953,9 @@ def get_stats( stats_name: ( str | Callable[[NDArrayNum], np.floating[Any]] | list[str | Callable[[NDArrayNum], np.floating[Any]]] | None ) = None, + inlier_mask: Mask | NDArrayBool | None = None, band: int = 1, + total_count: int | None = None, ) -> np.floating[Any] | dict[str, np.floating[Any]]: """ Retrieve specified statistics or all available statistics for the raster data. Allows passing custom callables @@ -1959,13 +1966,19 @@ def get_stats( - "mean", "median", "max", "min", "sum", "sum of squares", "90th percentile", "LE90", "nmad", "rmse", "std", "valid count", "total count", "percentage valid points", "size". Custom callables can also be provided. + :param inlier_mask: A boolean mask to filter values for statistical calculations. :param band: The index of the band for which to compute statistics. Default is 1. - + :param total_count: The total number of finite data points in the array. :returns: The requested statistic or a dictionary of statistics if multiple or all are requested. """ if not self.is_loaded: self.load() - stats_dict = self._statistics(band=band) + if inlier_mask is not None: + total_count = np.count_nonzero(~self.get_mask()) + dem_masked = self.copy() + dem_masked.set_mask(inlier_mask) + return dem_masked.get_stats(stats_name=stats_name, band=band, total_count=total_count) + stats_dict = self._statistics(band=band, total_count=total_count) if stats_name is None: return stats_dict diff --git a/tests/test_raster/test_raster.py b/tests/test_raster/test_raster.py index e4d09aa0..dda0c3e3 100644 --- a/tests/test_raster/test_raster.py +++ b/tests/test_raster/test_raster.py @@ -1993,6 +1993,7 @@ def test_stats(self, example: str, caplog) -> None: "Sum", "Sum of squares", "90th percentile", + "LE90", "NMAD", "RMSE", "Standard deviation", @@ -2008,6 +2009,11 @@ def test_stats(self, example: str, caplog) -> None: assert name in stats assert stats.get(name) is not None + # With mask + inlier_mask = raster.get_mask() + stats_masked = raster.get_stats(inlier_mask=inlier_mask) + assert stats_masked == stats + # Single stat for name in expected_stats: stat = raster.get_stats(stats_name=name) From 8f78710020583c507a9ade39d6e63f11c4d1ebc6 Mon Sep 17 00:00:00 2001 From: vschaffn Date: Tue, 25 Feb 2025 17:36:44 +0100 Subject: [PATCH 13/14] fix: disable total_count and percentage valid points in get_stats() if no inlier mask is passed --- doc/source/raster_class.md | 15 +++++++++++-- geoutils/raster/raster.py | 36 ++++++++++++++++++++------------ tests/test_raster/test_raster.py | 6 ++++-- 3 files changed, 40 insertions(+), 17 deletions(-) diff --git a/doc/source/raster_class.md b/doc/source/raster_class.md index 61cdd366..f0227be6 100644 --- a/doc/source/raster_class.md +++ b/doc/source/raster_class.md @@ -374,10 +374,13 @@ Supported statistics are : - **RMSE (Root Mean Square Error):** commonly used to express the magnitude of errors or variability and can give insight into the spread of the data. Only relevant when the raster represents a difference of two objects. Ignore masked values. - **Std (Standard deviation):** Measures the spread or dispersion of the data around the mean, ignoring masked values. - **Valid count:** The number of valid (unmasked) data points in the array. It counts the non-masked elements. -- **Total count:** The total number of finite data points in the array, including masked elements. -- **Percentage valid points:** The percentage of unmasked data points out of the total number of finite points. Useful for classification statistics. - **Size:** Total size of the raster. +If an inlier mask is passed: +- **Total count:** The number of unmasked data points in the array before applying the inlier mask. +- **Percentage valid points:** The ration between `valid_count` and `total_count` Useful for classification statistics. + + Callable functions are supported as well. ### Usage Examples: @@ -402,3 +405,11 @@ def custom_stat(data): return np.nansum(data > 100) # Count the number of pixels above 100 rast.get_stats(custom_stat) ``` + +- Passing an inlier mask: +```{code-cell} ipython3 +inlier_mask = np.array([[False, False , True], + [False, True , False], + [True, False , True]]) +rast.get_stats(inlier_mask=inlier_mask) +``` diff --git a/geoutils/raster/raster.py b/geoutils/raster/raster.py index 410331a3..9fc67363 100644 --- a/geoutils/raster/raster.py +++ b/geoutils/raster/raster.py @@ -1896,9 +1896,7 @@ def _statistics(self, band: int = 1, total_count: int | None = None) -> dict[str :param band: The index of the band for which to compute statistics. Default is 1. - :returns: A dictionary containing the calculated statistics for the selected band, including mean, median, max, - min, sum, sum of squares, 90th percentile, LE90, NMAD, RMSE, standard deviation, valid count, total count, - percentage valid points, size. + :returns: A dictionary containing the calculated statistics for the selected band. """ if self.count == 1: @@ -1909,8 +1907,6 @@ def _statistics(self, band: int = 1, total_count: int | None = None) -> dict[str # Compute the statistics mdata = np.ma.filled(data.astype(float), np.nan) valid_count = np.count_nonzero(~self.get_mask()) - if total_count is None: - total_count = valid_count stats_dict = { "Mean": np.ma.mean(data), "Median": np.ma.median(data), @@ -1924,10 +1920,17 @@ def _statistics(self, band: int = 1, total_count: int | None = None) -> dict[str "RMSE": np.sqrt(np.ma.mean(np.square(data))), "Standard deviation": np.ma.std(data), "Valid count": valid_count, - "Total count": total_count, - "Percentage valid points": (valid_count / total_count) * 100, "Size": data.size, } + + if total_count is not None: + stats_dict.update( + { + "Total count": total_count, + "Percentage valid points": (valid_count / total_count) * 100, + } + ) + return stats_dict @overload @@ -1962,10 +1965,11 @@ def get_stats( to calculate custom stats. :param stats_name: Name or list of names of the statistics to retrieve. If None, all statistics are returned. - Accepted names include: - - "mean", "median", "max", "min", "sum", "sum of squares", "90th percentile", "LE90", "nmad", "rmse", - "std", "valid count", "total count", "percentage valid points", "size". - Custom callables can also be provided. + Accepted names include: + `mean`, `median`, `max`, `min`, `sum`, `sum of squares`, `90th percentile`, `LE90`, `nmad`, `rmse`, + `std`, `valid count`, `size`, and if an inlier mask is passed : `total count`, + `percentage valid points`. + Custom callables can also be provided. :param inlier_mask: A boolean mask to filter values for statistical calculations. :param band: The index of the band for which to compute statistics. Default is 1. :param total_count: The total number of finite data points in the array. @@ -2002,10 +2006,16 @@ def get_stats( "std": "Standard deviation", "standarddeviation": "Standard deviation", "validcount": "Valid count", - "totalcount": "Total count", - "percentagevalidpoints": "Percentage valid points", "size": "Size", } + if total_count is not None: + stats_aliases.update( + { + "totalcount": "Total count", + "percentagevalidpoints": "Percentage valid points", + } + ) + if isinstance(stats_name, list): result = {} for name in stats_name: diff --git a/tests/test_raster/test_raster.py b/tests/test_raster/test_raster.py index dda0c3e3..963f6e9b 100644 --- a/tests/test_raster/test_raster.py +++ b/tests/test_raster/test_raster.py @@ -1998,8 +1998,6 @@ def test_stats(self, example: str, caplog) -> None: "RMSE", "Standard deviation", "Valid count", - "Total count", - "Percentage valid points", "Size", ] @@ -2012,6 +2010,10 @@ def test_stats(self, example: str, caplog) -> None: # With mask inlier_mask = raster.get_mask() stats_masked = raster.get_stats(inlier_mask=inlier_mask) + for name in ["Total count", "Percentage valid points"]: + assert name in stats_masked + assert stats_masked.get(name) is not None + stats_masked.pop(name) assert stats_masked == stats # Single stat From 8aebf0bfb01df3b21c11b304213d7b6eed815c0e Mon Sep 17 00:00:00 2001 From: vschaffn Date: Wed, 26 Feb 2025 14:16:23 +0100 Subject: [PATCH 14/14] fix: refactor stats --- doc/source/raster_class.md | 14 +++++---- geoutils/raster/raster.py | 50 ++++++++++++++++++++------------ tests/test_raster/test_raster.py | 10 +++++-- 3 files changed, 48 insertions(+), 26 deletions(-) diff --git a/doc/source/raster_class.md b/doc/source/raster_class.md index f0227be6..ef0d0e28 100644 --- a/doc/source/raster_class.md +++ b/doc/source/raster_class.md @@ -372,13 +372,17 @@ Supported statistics are : - **LE90 (Linear Error with 90% confidence):** Difference between the 95th and 5th percentiles of a dataset, representing the range within which 90% of the data points lie. Ignore masked values. - **NMAD (Normalized Median Absolute Deviation):** robust measure of variability in the data, less sensitive to outliers compared to standard deviation. Ignore masked values. - **RMSE (Root Mean Square Error):** commonly used to express the magnitude of errors or variability and can give insight into the spread of the data. Only relevant when the raster represents a difference of two objects. Ignore masked values. -- **Std (Standard deviation):** Measures the spread or dispersion of the data around the mean, ignoring masked values. -- **Valid count:** The number of valid (unmasked) data points in the array. It counts the non-masked elements. -- **Size:** Total size of the raster. +- **Std (Standard deviation):** measures the spread or dispersion of the data around the mean, ignoring masked values. +- **Valid count:** number of finite data points in the array. It counts the non-masked elements. +- **Total count:** total size of the raster. +- **Percentage valid points:** ratio between **Valid count** and **Total count**. + If an inlier mask is passed: -- **Total count:** The number of unmasked data points in the array before applying the inlier mask. -- **Percentage valid points:** The ration between `valid_count` and `total_count` Useful for classification statistics. +- **Total inlier count:** number of data points in the inlier mask. +- **Valid inlier count:** number of unmasked data points in the array after applying the inlier mask. +- **Percentage inlier points:** ratio between **Valid inlier count** and **Valid count**. Useful for classification statistics. +- **Percentage valid inlier points:** ratio between **Valid inlier count** and **Total inlier count**. Callable functions are supported as well. diff --git a/geoutils/raster/raster.py b/geoutils/raster/raster.py index 9fc67363..2746d673 100644 --- a/geoutils/raster/raster.py +++ b/geoutils/raster/raster.py @@ -1890,11 +1890,12 @@ def set_mask(self, mask: NDArrayBool | Mask) -> None: else: self.data[mask_arr > 0] = np.ma.masked - def _statistics(self, band: int = 1, total_count: int | None = None) -> dict[str, np.floating[Any]]: + def _statistics(self, band: int = 1, counts: tuple[int, int] | None = None) -> dict[str, np.floating[Any]]: """ Calculate common statistics for a specified band in the raster. :param band: The index of the band for which to compute statistics. Default is 1. + :param counts: (number of finite data points in the array, number of valid points in inlier_mask). :returns: A dictionary containing the calculated statistics for the selected band. """ @@ -1906,7 +1907,7 @@ def _statistics(self, band: int = 1, total_count: int | None = None) -> dict[str # Compute the statistics mdata = np.ma.filled(data.astype(float), np.nan) - valid_count = np.count_nonzero(~self.get_mask()) + valid_count = np.count_nonzero(~self.get_mask()) if counts is None else counts[0] stats_dict = { "Mean": np.ma.mean(data), "Median": np.ma.median(data), @@ -1920,14 +1921,18 @@ def _statistics(self, band: int = 1, total_count: int | None = None) -> dict[str "RMSE": np.sqrt(np.ma.mean(np.square(data))), "Standard deviation": np.ma.std(data), "Valid count": valid_count, - "Size": data.size, + "Total count": data.size, + "Percentage valid points": (valid_count / data.size) * 100, } - if total_count is not None: + if counts is not None: + valid_inlier_count = np.count_nonzero(~self.get_mask()) stats_dict.update( { - "Total count": total_count, - "Percentage valid points": (valid_count / total_count) * 100, + "Valid inlier count": valid_inlier_count, + "Total inlier count": counts[1], + "Percentage inlier points": (valid_inlier_count / counts[0]) * 100, + "Percentage valid inlier points": (valid_inlier_count / counts[1]) * 100, } ) @@ -1939,7 +1944,7 @@ def get_stats( stats_name: str | Callable[[NDArrayNum], np.floating[Any]], inlier_mask: Mask | NDArrayBool | None = None, band: int = 1, - total_count: int | None = None, + counts: tuple[int, int] | None = None, ) -> np.floating[Any]: ... @overload @@ -1948,7 +1953,7 @@ def get_stats( stats_name: list[str | Callable[[NDArrayNum], np.floating[Any]]] | None = None, inlier_mask: Mask | NDArrayBool | None = None, band: int = 1, - total_count: int | None = None, + counts: tuple[int, int] | None = None, ) -> dict[str, np.floating[Any]]: ... def get_stats( @@ -1958,7 +1963,7 @@ def get_stats( ) = None, inlier_mask: Mask | NDArrayBool | None = None, band: int = 1, - total_count: int | None = None, + counts: tuple[int, int] | None = None, ) -> np.floating[Any] | dict[str, np.floating[Any]]: """ Retrieve specified statistics or all available statistics for the raster data. Allows passing custom callables @@ -1967,22 +1972,26 @@ def get_stats( :param stats_name: Name or list of names of the statistics to retrieve. If None, all statistics are returned. Accepted names include: `mean`, `median`, `max`, `min`, `sum`, `sum of squares`, `90th percentile`, `LE90`, `nmad`, `rmse`, - `std`, `valid count`, `size`, and if an inlier mask is passed : `total count`, - `percentage valid points`. + `std`, `valid count`, `total count`, `percentage valid points` and if an inlier mask is passed : + `valid inlier count`, `total inlier count`, `percentage inlier point`, `percentage valid inlier points`. Custom callables can also be provided. :param inlier_mask: A boolean mask to filter values for statistical calculations. :param band: The index of the band for which to compute statistics. Default is 1. - :param total_count: The total number of finite data points in the array. + :param counts: (number of finite data points in the array, number of valid points in inlier_mask). DO NOT USE. :returns: The requested statistic or a dictionary of statistics if multiple or all are requested. """ if not self.is_loaded: self.load() if inlier_mask is not None: - total_count = np.count_nonzero(~self.get_mask()) + valid_points = np.count_nonzero(~self.get_mask()) + if isinstance(inlier_mask, Mask): + inlier_points = np.count_nonzero(~inlier_mask.data) + else: + inlier_points = np.count_nonzero(~inlier_mask) dem_masked = self.copy() dem_masked.set_mask(inlier_mask) - return dem_masked.get_stats(stats_name=stats_name, band=band, total_count=total_count) - stats_dict = self._statistics(band=band, total_count=total_count) + return dem_masked.get_stats(stats_name=stats_name, band=band, counts=(valid_points, inlier_points)) + stats_dict = self._statistics(band=band, counts=counts) if stats_name is None: return stats_dict @@ -2006,13 +2015,16 @@ def get_stats( "std": "Standard deviation", "standarddeviation": "Standard deviation", "validcount": "Valid count", - "size": "Size", + "totalcount": "Total count", + "percentagevalidpoints": "Percentage valid points", } - if total_count is not None: + if counts is not None: stats_aliases.update( { - "totalcount": "Total count", - "percentagevalidpoints": "Percentage valid points", + "validinliercount": "Valid inlier count", + "totalinliercount": "Total inlier count", + "percentagevalidinlierpoints": "Percentage valid inlier points", + "percentageinlierpoints": "Percentage inlier points", } ) diff --git a/tests/test_raster/test_raster.py b/tests/test_raster/test_raster.py index 963f6e9b..84d148f6 100644 --- a/tests/test_raster/test_raster.py +++ b/tests/test_raster/test_raster.py @@ -1998,7 +1998,8 @@ def test_stats(self, example: str, caplog) -> None: "RMSE", "Standard deviation", "Valid count", - "Size", + "Total count", + "Percentage valid points", ] # Full stats @@ -2010,7 +2011,12 @@ def test_stats(self, example: str, caplog) -> None: # With mask inlier_mask = raster.get_mask() stats_masked = raster.get_stats(inlier_mask=inlier_mask) - for name in ["Total count", "Percentage valid points"]: + for name in [ + "Valid inlier count", + "Total inlier count", + "Percentage inlier points", + "Percentage valid inlier points", + ]: assert name in stats_masked assert stats_masked.get(name) is not None stats_masked.pop(name)