From a4c0c94d62a9acb48736d828ffe46a02b42e7a93 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 26 Jul 2024 16:41:45 -0600 Subject: [PATCH 1/4] Improve precision for mean, std, var. np.bincount always accumulates to float64. So only cast after the division. --- numpy_groupies/aggregate_numpy.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/numpy_groupies/aggregate_numpy.py b/numpy_groupies/aggregate_numpy.py index 844f2f2..f971176 100644 --- a/numpy_groupies/aggregate_numpy.py +++ b/numpy_groupies/aggregate_numpy.py @@ -149,15 +149,13 @@ def _mean(group_idx, a, size, fill_value, dtype=np.dtype(np.float64)): sums.real = np.bincount(group_idx, weights=a.real, minlength=size) sums.imag = np.bincount(group_idx, weights=a.imag, minlength=size) else: - sums = np.bincount(group_idx, weights=a, minlength=size).astype( - dtype, copy=False - ) + sums = np.bincount(group_idx, weights=a, minlength=size) with np.errstate(divide="ignore", invalid="ignore"): - ret = sums.astype(dtype, copy=False) / counts + ret = sums / counts if not np.isnan(fill_value): ret[counts == 0] = fill_value - return ret + return ret.astype(dtype, copy=False) def _sum_of_squres(group_idx, a, size, fill_value, dtype=np.dtype(np.float64)): @@ -165,7 +163,7 @@ def _sum_of_squres(group_idx, a, size, fill_value, dtype=np.dtype(np.float64)): if fill_value != 0: counts = np.bincount(group_idx, minlength=size) ret[counts == 0] = fill_value - return ret + return ret.astype(dtype, copy=False) def _var( @@ -176,7 +174,7 @@ def _var( counts = np.bincount(group_idx, minlength=size) sums = np.bincount(group_idx, weights=a, minlength=size) with np.errstate(divide="ignore", invalid="ignore"): - means = sums.astype(dtype, copy=False) / counts + means = sums / counts counts = np.where(counts > ddof, counts - ddof, 0) ret = ( np.bincount(group_idx, (a - means[group_idx]) ** 2, minlength=size) / counts @@ -185,7 +183,7 @@ def _var( ret = np.sqrt(ret) # this is now std not var if not np.isnan(fill_value): ret[counts == 0] = fill_value - return ret + return ret.astype(dtype, copy=False) def _std(group_idx, a, size, fill_value, dtype=np.dtype(np.float64), ddof=0): From fee245f15c8587e3bcf13869640b96206af80669 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Sun, 28 Jul 2024 14:09:45 -0600 Subject: [PATCH 2/4] Fix cumsum --- numpy_groupies/aggregate_numpy.py | 5 ++++- numpy_groupies/tests/test_generic.py | 11 +++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/numpy_groupies/aggregate_numpy.py b/numpy_groupies/aggregate_numpy.py index f971176..1eff227 100644 --- a/numpy_groupies/aggregate_numpy.py +++ b/numpy_groupies/aggregate_numpy.py @@ -250,7 +250,10 @@ def _cumsum(group_idx, a, size, fill_value=None, dtype=None): increasing = np.arange(len(a), dtype=int) group_starts = _min(group_idx_srt, increasing, size, fill_value=0)[group_idx_srt] - a_srt_cumsum += -a_srt_cumsum[group_starts] + a_srt[group_starts] + # First subtract large numbers + a_srt_cumsum -= a_srt_cumsum[group_starts] + # Then add potentially small numbers + a_srt_cumsum += a_srt[group_starts] return a_srt_cumsum[invsortidx] diff --git a/numpy_groupies/tests/test_generic.py b/numpy_groupies/tests/test_generic.py index 2a2811d..7a8b772 100644 --- a/numpy_groupies/tests/test_generic.py +++ b/numpy_groupies/tests/test_generic.py @@ -570,3 +570,14 @@ def test_var_with_nan_fill_value(aggregate_all, ddof, nan_inds, func): group_idx, a, axis=-1, fill_value=np.nan, func=func, ddof=ddof ) np.testing.assert_equal(actual, expected) + + +def test_cumsum_accuracy(aggregate_all): + array = np.array( + [0.00000000e00, 0.00000000e00, 0.00000000e00, 3.27680000e04, 9.99999975e-06] + ) + group_idx = np.array([0, 0, 0, 0, 1]) + + actual = aggregate_all(group_idx, array, axis=-1, func="cumsum") + expected = array + np.testing.assert_allclose(actual, expected) From 248a5ec64780871af32a191aba47d91f85dd1ed4 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Sun, 28 Jul 2024 14:12:16 -0600 Subject: [PATCH 3/4] edit --- numpy_groupies/aggregate_numpy.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/numpy_groupies/aggregate_numpy.py b/numpy_groupies/aggregate_numpy.py index 1eff227..12d88fe 100644 --- a/numpy_groupies/aggregate_numpy.py +++ b/numpy_groupies/aggregate_numpy.py @@ -155,7 +155,10 @@ def _mean(group_idx, a, size, fill_value, dtype=np.dtype(np.float64)): ret = sums / counts if not np.isnan(fill_value): ret[counts == 0] = fill_value - return ret.astype(dtype, copy=False) + if iscomplexobj(a): + return ret + else: + ret.astype(dtype, copy=False) def _sum_of_squres(group_idx, a, size, fill_value, dtype=np.dtype(np.float64)): @@ -163,7 +166,10 @@ def _sum_of_squres(group_idx, a, size, fill_value, dtype=np.dtype(np.float64)): if fill_value != 0: counts = np.bincount(group_idx, minlength=size) ret[counts == 0] = fill_value - return ret.astype(dtype, copy=False) + if iscomplexobj(a): + return ret + else: + ret.astype(dtype, copy=False) def _var( @@ -183,7 +189,10 @@ def _var( ret = np.sqrt(ret) # this is now std not var if not np.isnan(fill_value): ret[counts == 0] = fill_value - return ret.astype(dtype, copy=False) + if iscomplexobj(a): + return ret + else: + ret.astype(dtype, copy=False) def _std(group_idx, a, size, fill_value, dtype=np.dtype(np.float64), ddof=0): From dbe87c2f4cfeff2123cd35c9faa4edd507448936 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Sun, 28 Jul 2024 17:02:32 -0600 Subject: [PATCH 4/4] Fix --- numpy_groupies/aggregate_numpy.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/numpy_groupies/aggregate_numpy.py b/numpy_groupies/aggregate_numpy.py index 12d88fe..dfbf11e 100644 --- a/numpy_groupies/aggregate_numpy.py +++ b/numpy_groupies/aggregate_numpy.py @@ -158,7 +158,7 @@ def _mean(group_idx, a, size, fill_value, dtype=np.dtype(np.float64)): if iscomplexobj(a): return ret else: - ret.astype(dtype, copy=False) + return ret.astype(dtype, copy=False) def _sum_of_squres(group_idx, a, size, fill_value, dtype=np.dtype(np.float64)): @@ -169,7 +169,7 @@ def _sum_of_squres(group_idx, a, size, fill_value, dtype=np.dtype(np.float64)): if iscomplexobj(a): return ret else: - ret.astype(dtype, copy=False) + return ret.astype(dtype, copy=False) def _var( @@ -192,7 +192,7 @@ def _var( if iscomplexobj(a): return ret else: - ret.astype(dtype, copy=False) + return ret.astype(dtype, copy=False) def _std(group_idx, a, size, fill_value, dtype=np.dtype(np.float64), ddof=0):