From 9b80c2f3d10915ffa33738aa252fa9c4e3914183 Mon Sep 17 00:00:00 2001 From: Josiah Parry Date: Wed, 14 Feb 2024 17:08:51 -0500 Subject: [PATCH 1/9] draft significance.py --- esda/significance.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 esda/significance.py diff --git a/esda/significance.py b/esda/significance.py new file mode 100644 index 00000000..aa6a19f2 --- /dev/null +++ b/esda/significance.py @@ -0,0 +1,32 @@ +import numpy as np + +def calculate_significance(test_stat, reference_distribution, method='two-sided'): + """ + Calculate a pseudo p-value from a reference distribution. + + Pseudo-p values are calculated using the formula (M + 1) / (R + 1). Where R is the number of simulations and M is the number of times that the simulated value was equal to, or more extreme than the observed test statistic. + + Simulated test statistics are generated through a process of conditional permutation. Conditional permutation holds fixed the value of Xi and values of neighbors are randomly sampled from X removing Xi simulating spatial randomness. This process is repeated R times to generate a reference distribution from which the pseudo-p value is calculated. + + Parameters + ---------- + test_stat: + The observed test statistic + reference_distribution: + A one-dimensional numpy array containing simulated test statistics as a result of conditional permutation. + method: + One of 'two-sided', 'lesser', or 'greater'. Indicates the alternative hypothesis. + - 'two-sided': the observed test-statistic is more-extreme than expected under the assumption of complete spatial randomness. + - 'lesser': the observed test-statistic is less than the expected value under the assumption of complete spatial randomness. + - 'greater': the observed test-statistic is greater than the exepcted value under the assumption of complete spatial randomness. + + """ + if method == 'two-sided': + p_value = np.sum(reference_distribution >= np.abs(test_stat)) / (len(reference_distribution) + 1) + elif method == 'lesser': + p_value = (np.sum(reference_distribution >= test_stat) + 1) / (len(reference_distribution) + 1) + elif method == 'greater': + p_value = (np.sum(reference_distribution <= test_stat) + 1) / (len(reference_distribution) + 1) + else: + raise ValueError(f"Unknown method {method}") + return p_value \ No newline at end of file From 5e6b05b67de9b1547e9b1ae823bee8ca94f6d560 Mon Sep 17 00:00:00 2001 From: Josiah Parry Date: Wed, 14 Feb 2024 17:13:28 -0500 Subject: [PATCH 2/9] multiply two-sided by 2 --- esda/significance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/esda/significance.py b/esda/significance.py index aa6a19f2..70f5bfd8 100644 --- a/esda/significance.py +++ b/esda/significance.py @@ -22,7 +22,7 @@ def calculate_significance(test_stat, reference_distribution, method='two-sided' """ if method == 'two-sided': - p_value = np.sum(reference_distribution >= np.abs(test_stat)) / (len(reference_distribution) + 1) + p_value = 2 * (np.sum(reference_distribution >= np.abs(test_stat)) / (len(reference_distribution) + 1)) elif method == 'lesser': p_value = (np.sum(reference_distribution >= test_stat) + 1) / (len(reference_distribution) + 1) elif method == 'greater': From 1783ae92bd8f1b36ced7e536f33f4edb2dff3395 Mon Sep 17 00:00:00 2001 From: ljwolf Date: Thu, 15 Feb 2024 10:18:54 +0000 Subject: [PATCH 3/9] add the two-sided percentile-based test and directed test with array shaping --- esda/significance.py | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/esda/significance.py b/esda/significance.py index 70f5bfd8..5ca83a01 100644 --- a/esda/significance.py +++ b/esda/significance.py @@ -11,22 +11,36 @@ def calculate_significance(test_stat, reference_distribution, method='two-sided' Parameters ---------- test_stat: - The observed test statistic + The observed test statistic, or a vector of observed test statistics reference_distribution: - A one-dimensional numpy array containing simulated test statistics as a result of conditional permutation. + A numpy array containing simulated test statistics as a result of conditional permutation. method: One of 'two-sided', 'lesser', or 'greater'. Indicates the alternative hypothesis. - 'two-sided': the observed test-statistic is more-extreme than expected under the assumption of complete spatial randomness. - 'lesser': the observed test-statistic is less than the expected value under the assumption of complete spatial randomness. - 'greater': the observed test-statistic is greater than the exepcted value under the assumption of complete spatial randomness. - + - 'directed': run both lesser and greater tests, then pick the smaller p-value. """ - if method == 'two-sided': - p_value = 2 * (np.sum(reference_distribution >= np.abs(test_stat)) / (len(reference_distribution) + 1)) + reference_distribution = numpy.atleast_2d(reference_distribution) + n_samples,p_permutations = reference_distribution.shape + test_stat = numpy.atleast_2d(test_stat).reshape(n_samples, -1) + if method == "directed": + larger = (reference_distribution >= test_stat).sum(axis=1) + low_extreme = (p_permutations - larger) < larger + larger[low_extreme] = p_permutations - larger[low_extreme] + p_value = (larger + 1.0) / (p_permutations + 1.0) elif method == 'lesser': - p_value = (np.sum(reference_distribution >= test_stat) + 1) / (len(reference_distribution) + 1) + p_value = (np.sum(reference_distribution >= test_stat, axis=1) + 1) / (len(reference_distribution) + 1) elif method == 'greater': - p_value = (np.sum(reference_distribution <= test_stat) + 1) / (len(reference_distribution) + 1) + p_value = (np.sum(reference_distribution <= test_stat, axis=1) + 1) / (len(reference_distribution) + 1) + elif method == "two-sided": + percentile = (reference_distribution < test_stat).mean(axis=1) + bounds = np.column_stack((1-percentile, percentile)) * 100 + bounds.sort(axis=1) + lows, highs = np.row_stack([stats.scoreatpercentile(r, per=p) for r,p in zip(reference_distribution, bounds)]).T + n_outside = (reference_distribution <= lows[:,None]).sum(axis=1) + n_outside += (reference_distribution >= highs[:,None]).sum(axis=1) + 1 + p_value = (n_outside+1) / (p_permutations+1) else: - raise ValueError(f"Unknown method {method}") + raise ValueError(f"Unknown p-value method: {method}. Generally, 'two-sided' is a good default!") return p_value \ No newline at end of file From 65552caa90b9c237bca8ff6142ed471ef1933012 Mon Sep 17 00:00:00 2001 From: ljwolf Date: Thu, 15 Feb 2024 10:25:47 +0000 Subject: [PATCH 4/9] fix imports --- esda/significance.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/esda/significance.py b/esda/significance.py index 5ca83a01..1975364a 100644 --- a/esda/significance.py +++ b/esda/significance.py @@ -1,4 +1,5 @@ import numpy as np +from scipy import stats def calculate_significance(test_stat, reference_distribution, method='two-sided'): """ @@ -21,9 +22,9 @@ def calculate_significance(test_stat, reference_distribution, method='two-sided' - 'greater': the observed test-statistic is greater than the exepcted value under the assumption of complete spatial randomness. - 'directed': run both lesser and greater tests, then pick the smaller p-value. """ - reference_distribution = numpy.atleast_2d(reference_distribution) + reference_distribution = np.atleast_2d(reference_distribution) n_samples,p_permutations = reference_distribution.shape - test_stat = numpy.atleast_2d(test_stat).reshape(n_samples, -1) + test_stat = np.atleast_2d(test_stat).reshape(n_samples, -1) if method == "directed": larger = (reference_distribution >= test_stat).sum(axis=1) low_extreme = (p_permutations - larger) < larger From 920719c66d3761b104c7c60cf76c388c7d09d15f Mon Sep 17 00:00:00 2001 From: ljwolf Date: Thu, 15 Feb 2024 10:30:49 +0000 Subject: [PATCH 5/9] add folding-based p-value --- esda/significance.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/esda/significance.py b/esda/significance.py index 1975364a..0c1e441c 100644 --- a/esda/significance.py +++ b/esda/significance.py @@ -31,9 +31,9 @@ def calculate_significance(test_stat, reference_distribution, method='two-sided' larger[low_extreme] = p_permutations - larger[low_extreme] p_value = (larger + 1.0) / (p_permutations + 1.0) elif method == 'lesser': - p_value = (np.sum(reference_distribution >= test_stat, axis=1) + 1) / (len(reference_distribution) + 1) + p_value = (np.sum(reference_distribution >= test_stat, axis=1) + 1) / (p_permutations + 1) elif method == 'greater': - p_value = (np.sum(reference_distribution <= test_stat, axis=1) + 1) / (len(reference_distribution) + 1) + p_value = (np.sum(reference_distribution <= test_stat, axis=1) + 1) / (p_permutations + 1) elif method == "two-sided": percentile = (reference_distribution < test_stat).mean(axis=1) bounds = np.column_stack((1-percentile, percentile)) * 100 @@ -42,6 +42,11 @@ def calculate_significance(test_stat, reference_distribution, method='two-sided' n_outside = (reference_distribution <= lows[:,None]).sum(axis=1) n_outside += (reference_distribution >= highs[:,None]).sum(axis=1) + 1 p_value = (n_outside+1) / (p_permutations+1) + elif method == "folded": + means = reference_distribution.mean(axis=1, keepdims=True) + test_stat = np.abs(test_stat - means) + reference_distribution = np.abs(reference_distribution - means) + p_value = ((reference_distribution>=test_stat).sum(axis=1) + 1) / (p_permutations + 1) else: raise ValueError(f"Unknown p-value method: {method}. Generally, 'two-sided' is a good default!") return p_value \ No newline at end of file From d9ea095bf494a08f8810be0de9ff4cddb4235a2d Mon Sep 17 00:00:00 2001 From: ljwolf Date: Thu, 15 Feb 2024 10:42:15 +0000 Subject: [PATCH 6/9] swap to strict inequality --- esda/significance.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/esda/significance.py b/esda/significance.py index 0c1e441c..894e44d3 100644 --- a/esda/significance.py +++ b/esda/significance.py @@ -39,14 +39,14 @@ def calculate_significance(test_stat, reference_distribution, method='two-sided' bounds = np.column_stack((1-percentile, percentile)) * 100 bounds.sort(axis=1) lows, highs = np.row_stack([stats.scoreatpercentile(r, per=p) for r,p in zip(reference_distribution, bounds)]).T - n_outside = (reference_distribution <= lows[:,None]).sum(axis=1) - n_outside += (reference_distribution >= highs[:,None]).sum(axis=1) + 1 + n_outside = (reference_distribution < lows[:,None]).sum(axis=1) + n_outside += (reference_distribution > highs[:,None]).sum(axis=1) + 1 p_value = (n_outside+1) / (p_permutations+1) elif method == "folded": means = reference_distribution.mean(axis=1, keepdims=True) test_stat = np.abs(test_stat - means) reference_distribution = np.abs(reference_distribution - means) - p_value = ((reference_distribution>=test_stat).sum(axis=1) + 1) / (p_permutations + 1) + p_value = ((reference_distribution >= test_stat).sum(axis=1) + 1) / (p_permutations + 1) else: raise ValueError(f"Unknown p-value method: {method}. Generally, 'two-sided' is a good default!") return p_value \ No newline at end of file From c6e3a8ae821cde40fdcd4a15cb1f615db2f748e6 Mon Sep 17 00:00:00 2001 From: ljwolf Date: Thu, 15 Feb 2024 10:43:59 +0000 Subject: [PATCH 7/9] add example and ruff --- esda/significance.py | 87 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 68 insertions(+), 19 deletions(-) diff --git a/esda/significance.py b/esda/significance.py index 894e44d3..60cb6d4d 100644 --- a/esda/significance.py +++ b/esda/significance.py @@ -1,11 +1,12 @@ import numpy as np from scipy import stats -def calculate_significance(test_stat, reference_distribution, method='two-sided'): + +def calculate_significance(test_stat, reference_distribution, method="two-sided"): """ - Calculate a pseudo p-value from a reference distribution. + Calculate a pseudo p-value from a reference distribution. - Pseudo-p values are calculated using the formula (M + 1) / (R + 1). Where R is the number of simulations and M is the number of times that the simulated value was equal to, or more extreme than the observed test statistic. + Pseudo-p values are calculated using the formula (M + 1) / (R + 1). Where R is the number of simulations and M is the number of times that the simulated value was equal to, or more extreme than the observed test statistic. Simulated test statistics are generated through a process of conditional permutation. Conditional permutation holds fixed the value of Xi and values of neighbors are randomly sampled from X removing Xi simulating spatial randomness. This process is repeated R times to generate a reference distribution from which the pseudo-p value is calculated. @@ -15,38 +16,86 @@ def calculate_significance(test_stat, reference_distribution, method='two-sided' The observed test statistic, or a vector of observed test statistics reference_distribution: A numpy array containing simulated test statistics as a result of conditional permutation. - method: + method: One of 'two-sided', 'lesser', or 'greater'. Indicates the alternative hypothesis. - 'two-sided': the observed test-statistic is more-extreme than expected under the assumption of complete spatial randomness. - 'lesser': the observed test-statistic is less than the expected value under the assumption of complete spatial randomness. - - 'greater': the observed test-statistic is greater than the exepcted value under the assumption of complete spatial randomness. - - 'directed': run both lesser and greater tests, then pick the smaller p-value. + - 'greater': the observed test-statistic is greater than the exepcted value under the assumption of complete spatial randomness. + - 'directed': run both lesser and greater tests, then pick the smaller p-value. """ reference_distribution = np.atleast_2d(reference_distribution) - n_samples,p_permutations = reference_distribution.shape + n_samples, p_permutations = reference_distribution.shape test_stat = np.atleast_2d(test_stat).reshape(n_samples, -1) if method == "directed": larger = (reference_distribution >= test_stat).sum(axis=1) low_extreme = (p_permutations - larger) < larger larger[low_extreme] = p_permutations - larger[low_extreme] p_value = (larger + 1.0) / (p_permutations + 1.0) - elif method == 'lesser': - p_value = (np.sum(reference_distribution >= test_stat, axis=1) + 1) / (p_permutations + 1) - elif method == 'greater': - p_value = (np.sum(reference_distribution <= test_stat, axis=1) + 1) / (p_permutations + 1) + elif method == "lesser": + p_value = (np.sum(reference_distribution >= test_stat, axis=1) + 1) / ( + p_permutations + 1 + ) + elif method == "greater": + p_value = (np.sum(reference_distribution <= test_stat, axis=1) + 1) / ( + p_permutations + 1 + ) elif method == "two-sided": percentile = (reference_distribution < test_stat).mean(axis=1) - bounds = np.column_stack((1-percentile, percentile)) * 100 + bounds = np.column_stack((1 - percentile, percentile)) * 100 bounds.sort(axis=1) - lows, highs = np.row_stack([stats.scoreatpercentile(r, per=p) for r,p in zip(reference_distribution, bounds)]).T - n_outside = (reference_distribution < lows[:,None]).sum(axis=1) - n_outside += (reference_distribution > highs[:,None]).sum(axis=1) + 1 - p_value = (n_outside+1) / (p_permutations+1) + lows, highs = np.row_stack( + [ + stats.scoreatpercentile(r, per=p) + for r, p in zip(reference_distribution, bounds) + ] + ).T + n_outside = (reference_distribution < lows[:, None]).sum(axis=1) + n_outside += (reference_distribution > highs[:, None]).sum(axis=1) + 1 + p_value = (n_outside + 1) / (p_permutations + 1) elif method == "folded": means = reference_distribution.mean(axis=1, keepdims=True) test_stat = np.abs(test_stat - means) reference_distribution = np.abs(reference_distribution - means) - p_value = ((reference_distribution >= test_stat).sum(axis=1) + 1) / (p_permutations + 1) + p_value = ((reference_distribution >= test_stat).sum(axis=1) + 1) / ( + p_permutations + 1 + ) else: - raise ValueError(f"Unknown p-value method: {method}. Generally, 'two-sided' is a good default!") - return p_value \ No newline at end of file + raise ValueError( + f"Unknown p-value method: {method}. Generally, 'two-sided' is a good default!" + ) + return p_value + + +if __name__ == "__main__": + import numpy + import esda + import pandas + from libpysal.weights import Voronoi + from esda.significance import calculate_significance + + coordinates = numpy.random.random(size=(2000, 2)) + x = numpy.random.normal(size=(2000,)) + w = Voronoi(coordinates, clip="bbox") + w.transform = "r" + stat = esda.Moran_Local(x, w) + + ts = calculate_significance(stat.Is, stat.rlisas, method="two-sided") + di = calculate_significance(stat.Is, stat.rlisas, method="directed") + lt = calculate_significance(stat.Is, stat.rlisas, method="lesser") + gt = calculate_significance(stat.Is, stat.rlisas, method="greater") + fo = calculate_significance(stat.Is, stat.rlisas, method="folded") + + numpy.testing.assert_array_equal( + numpy.minimum(lt, gt), di + ) # di is just the minimum of the two tests + + print( + f"directed * 2 is the same as two-sided {(di*2 == ts).mean()*100}% of the time" + ) + + print( + pandas.DataFrame( + numpy.column_stack((ts, di, fo, lt, gt)), + columns=["two-sided", "directed", "folded", "lt", "gt"], + ).corr() + ) From fa2deaab56aea62a5fd5dc5750990c250d080d4e Mon Sep 17 00:00:00 2001 From: ljwolf Date: Fri, 23 Feb 2024 16:17:54 +0000 Subject: [PATCH 8/9] update significance directions --- esda/significance.py | 52 +++++++++++++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 13 deletions(-) diff --git a/esda/significance.py b/esda/significance.py index 60cb6d4d..3ca10f72 100644 --- a/esda/significance.py +++ b/esda/significance.py @@ -6,22 +6,27 @@ def calculate_significance(test_stat, reference_distribution, method="two-sided" """ Calculate a pseudo p-value from a reference distribution. - Pseudo-p values are calculated using the formula (M + 1) / (R + 1). Where R is the number of simulations and M is the number of times that the simulated value was equal to, or more extreme than the observed test statistic. - - Simulated test statistics are generated through a process of conditional permutation. Conditional permutation holds fixed the value of Xi and values of neighbors are randomly sampled from X removing Xi simulating spatial randomness. This process is repeated R times to generate a reference distribution from which the pseudo-p value is calculated. + Pseudo-p values are calculated using the formula (M + 1) / (R + 1). Where R is the number of simulations + and M is the number of times that the simulated value was equal to, or more extreme than the observed test statistic. Parameters ---------- - test_stat: + test_stat: float or numpy.ndarray The observed test statistic, or a vector of observed test statistics - reference_distribution: + reference_distribution: numpy.ndarray A numpy array containing simulated test statistics as a result of conditional permutation. - method: + method: string One of 'two-sided', 'lesser', or 'greater'. Indicates the alternative hypothesis. - - 'two-sided': the observed test-statistic is more-extreme than expected under the assumption of complete spatial randomness. - - 'lesser': the observed test-statistic is less than the expected value under the assumption of complete spatial randomness. - - 'greater': the observed test-statistic is greater than the exepcted value under the assumption of complete spatial randomness. - - 'directed': run both lesser and greater tests, then pick the smaller p-value. + - 'two-sided': the observed test-statistic is an extreme value of the reference distribution. + - 'lesser': the observed test-statistic is small relative to the reference distribution. + - 'greater': the observed test-statistic is large relative to the reference distribution. + - 'directed': the observed test statistic is either small or large reltaive to the reference distribution. + + Notes + ----- + + the directed p-value is half of the two-sided p-value, and corresponds to running the + lesser and greater tests, then picking the smaller significance value. This is not advised. """ reference_distribution = np.atleast_2d(reference_distribution) n_samples, p_permutations = reference_distribution.shape @@ -32,11 +37,11 @@ def calculate_significance(test_stat, reference_distribution, method="two-sided" larger[low_extreme] = p_permutations - larger[low_extreme] p_value = (larger + 1.0) / (p_permutations + 1.0) elif method == "lesser": - p_value = (np.sum(reference_distribution >= test_stat, axis=1) + 1) / ( + p_value = (np.sum(reference_distribution <= test_stat, axis=1) + 1) / ( p_permutations + 1 ) elif method == "greater": - p_value = (np.sum(reference_distribution <= test_stat, axis=1) + 1) / ( + p_value = (np.sum(reference_distribution >= test_stat, axis=1) + 1) / ( p_permutations + 1 ) elif method == "two-sided": @@ -71,7 +76,6 @@ def calculate_significance(test_stat, reference_distribution, method="two-sided" import esda import pandas from libpysal.weights import Voronoi - from esda.significance import calculate_significance coordinates = numpy.random.random(size=(2000, 2)) x = numpy.random.normal(size=(2000,)) @@ -99,3 +103,25 @@ def calculate_significance(test_stat, reference_distribution, method="two-sided" columns=["two-sided", "directed", "folded", "lt", "gt"], ).corr() ) + + answer = input("run big simulation? [y/n]") + if answer.lower().startswith("y"): + all_correlations = [] + for i in range(1000): + x = numpy.random.normal(size=(2000,)) + stat = esda.Moran_Local(x, w) + ts = calculate_significance(stat.Is, stat.rlisas, method="two-sided") + di = calculate_significance(stat.Is, stat.rlisas, method="directed") + lt = calculate_significance(stat.Is, stat.rlisas, method="lesser") + gt = calculate_significance(stat.Is, stat.rlisas, method="greater") + fo = calculate_significance(stat.Is, stat.rlisas, method="folded") + corrs = ( + pandas.DataFrame( + numpy.column_stack((ts, di, fo, lt, gt)), + columns=["two-sided", "directed", "folded", "lt", "gt"], + ) + .corr() + .assign(repno=i) + ) + all_correlations.append(corrs) + all_correlations = pandas.concat(all_correlations) From 1c59fa7c62c67bdea70e3eeac1bd6786dd0fd04c Mon Sep 17 00:00:00 2001 From: Levi Wolf Date: Wed, 6 Mar 2024 12:30:53 +0000 Subject: [PATCH 9/9] adding one below is sufficient --- esda/significance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/esda/significance.py b/esda/significance.py index 3ca10f72..a01adf07 100644 --- a/esda/significance.py +++ b/esda/significance.py @@ -55,7 +55,7 @@ def calculate_significance(test_stat, reference_distribution, method="two-sided" ] ).T n_outside = (reference_distribution < lows[:, None]).sum(axis=1) - n_outside += (reference_distribution > highs[:, None]).sum(axis=1) + 1 + n_outside += (reference_distribution > highs[:, None]).sum(axis=1) p_value = (n_outside + 1) / (p_permutations + 1) elif method == "folded": means = reference_distribution.mean(axis=1, keepdims=True)