From 6e9d7671d6acd9798fca9a06f7d0629318141293 Mon Sep 17 00:00:00 2001 From: Benjamin Lieser Date: Mon, 16 Sep 2024 15:31:11 +0200 Subject: [PATCH 01/13] cdf testing prototype --- rand_distr/tests/cdf.rs | 132 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100644 rand_distr/tests/cdf.rs diff --git a/rand_distr/tests/cdf.rs b/rand_distr/tests/cdf.rs new file mode 100644 index 0000000000..32284c81ad --- /dev/null +++ b/rand_distr/tests/cdf.rs @@ -0,0 +1,132 @@ +// Copyright 2021 Developers of the Rand project. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use rand::SeedableRng; +use rand_distr::{Distribution, Normal}; +use statrs::distribution::ContinuousCDF; +use statrs::distribution::DiscreteCDF; + +/// Empirical Cumulative Distribution Function (ECDF) +struct ECDF { + sorted_samples: Vec, +} + +impl ECDF { + fn new(mut samples: Vec) -> Self { + samples.sort_by(|a, b| a.partial_cmp(b).unwrap()); + Self { + sorted_samples: samples, + } + } + + /// Returns the step points of the ECDF + /// The ECDF is a step function that increases by 1/n at each sample point + /// The function is continoius from the right, so we give the bigger value at the step points + /// First point is (-inf, 0.0), last point is (max(samples), 1.0) + fn step_points(&self) -> Vec<(f64, f64)> { + let mut points = Vec::with_capacity(self.sorted_samples.len() + 1); + let mut last = f64::NEG_INFINITY; + let mut count = 0; + let n = self.sorted_samples.len() as f64; + for &x in &self.sorted_samples { + if x != last { + points.push((last, count as f64 / n)); + last = x; + } + count += 1; + } + points.push((last, count as f64 / n)); + points + } +} + +fn kolmo_smirnov_statistic_continuous(ecdf: ECDF, cdf: impl Fn(f64) -> f64) -> f64 { + let mut max_diff = 0.; + // The maximum will always be at a step point, because the cdf is continious monotonic increasing + let step_points = ecdf.step_points(); + for i in 0..step_points.len() - 1 { + // This shift is because we want the value of the ecdf at (x_{i + 1} - epsilon) = ecdf(x_i) and comare it to cdf(x_{i + 1}) + // cdf(x_{i + 1} - epsilon) = cdf(x_{i+1}) because cdf is continious + let x = step_points[i + 1].0; + let diff = (step_points[i].1 - cdf(x)).abs(); + + if diff > max_diff { + max_diff = diff; + } + } + max_diff +} + +fn kolmo_smirnov_statistic_discrete(ecdf: ECDF, cdf: impl Fn(i64) -> f64) -> f64 { + let mut max_diff = 0.; + + // The maximum will always be at a step point, but we have to be careful because the cdf is not continious + // It is actually easier because both are right continious step functions + let step_points = ecdf.step_points(); + for i in 1..step_points.len() { + let x = step_points[i].0; + let diff = (step_points[i].1 - cdf(x as i64)).abs(); + if diff > max_diff { + max_diff = diff; + } + } + max_diff +} +#[test] +fn normal() { + const N_SAMPLES: u64 = 1_000_000; + const MEAN: f64 = 2.; + const STD_DEV: f64 = 50.0; + + let dist = Normal::new(MEAN, STD_DEV).unwrap(); + let mut rng = rand::rngs::SmallRng::seed_from_u64(1); + let samples = (0..N_SAMPLES).map(|_| dist.sample(&mut rng)).collect(); + let ecdf = ECDF::new(samples); + + let cdf = |x| { + statrs::distribution::Normal::new(MEAN, STD_DEV) + .unwrap() + .cdf(x) + }; + + let ks_statistic = kolmo_smirnov_statistic_continuous(ecdf, cdf); + + let critical_value = 1.36 / (N_SAMPLES as f64).sqrt(); + + println!("KS statistic: {}", ks_statistic); + println!("Critical value: {}", critical_value); + assert!(ks_statistic < critical_value); +} + +#[test] +fn binomial() { + const N_SAMPLES: u64 = 10_000_000; + const N: u64 = 10000000; + const P: f64 = 0.001; + + let dist = rand_distr::Binomial::new(N, P).unwrap(); + let mut rng = rand::rngs::SmallRng::seed_from_u64(1); + let samples = (0..N_SAMPLES) + .map(|_| dist.sample(&mut rng) as f64) + .collect(); + let ecdf = ECDF::new(samples); + + let cdf = |x| { + statrs::distribution::Binomial::new(P, N) + .unwrap() + .cdf(x as u64) + }; + + let ks_statistic = kolmo_smirnov_statistic_discrete(ecdf, cdf); + + let critical_value = 1.36 / (N_SAMPLES as f64).sqrt(); + + println!("KS statistic: {}", ks_statistic); + println!("Critical value: {}", critical_value); + assert!(ks_statistic < critical_value); +} From 166b94548bfe65b808210f28dd934b52329cd2c7 Mon Sep 17 00:00:00 2001 From: Benjamin Lieser Date: Tue, 17 Sep 2024 17:56:41 +0200 Subject: [PATCH 02/13] abstraction ove the test --- rand_distr/tests/cdf.rs | 57 +++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/rand_distr/tests/cdf.rs b/rand_distr/tests/cdf.rs index 32284c81ad..e1a8b4bf94 100644 --- a/rand_distr/tests/cdf.rs +++ b/rand_distr/tests/cdf.rs @@ -77,25 +77,17 @@ fn kolmo_smirnov_statistic_discrete(ecdf: ECDF, cdf: impl Fn(i64) -> f64) -> f64 } max_diff } -#[test] -fn normal() { - const N_SAMPLES: u64 = 1_000_000; - const MEAN: f64 = 2.; - const STD_DEV: f64 = 50.0; - let dist = Normal::new(MEAN, STD_DEV).unwrap(); +#[cfg(test)] +fn test_continious(dist: impl Distribution, cdf: impl Fn(f64) -> f64) { + const N_SAMPLES: u64 = 1_000_000; let mut rng = rand::rngs::SmallRng::seed_from_u64(1); let samples = (0..N_SAMPLES).map(|_| dist.sample(&mut rng)).collect(); let ecdf = ECDF::new(samples); - let cdf = |x| { - statrs::distribution::Normal::new(MEAN, STD_DEV) - .unwrap() - .cdf(x) - }; - - let ks_statistic = kolmo_smirnov_statistic_continuous(ecdf, cdf); + let ks_statistic = kolmo_smirnov_statistic_continuous(ecdf, |x| cdf(x)); + // It is a heuristic value, we want to prove that the distributions match, so p values don't help us let critical_value = 1.36 / (N_SAMPLES as f64).sqrt(); println!("KS statistic: {}", ks_statistic); @@ -103,26 +95,19 @@ fn normal() { assert!(ks_statistic < critical_value); } -#[test] -fn binomial() { - const N_SAMPLES: u64 = 10_000_000; - const N: u64 = 10000000; - const P: f64 = 0.001; - - let dist = rand_distr::Binomial::new(N, P).unwrap(); +#[cfg(test)] +fn test_discrete>(dist: impl Distribution, cdf: impl Fn(i64) -> f64) +where + >::Error: std::fmt::Debug, +{ + const N_SAMPLES: u64 = 1_000_000; let mut rng = rand::rngs::SmallRng::seed_from_u64(1); let samples = (0..N_SAMPLES) - .map(|_| dist.sample(&mut rng) as f64) + .map(|_| dist.sample(&mut rng).try_into().unwrap() as f64) .collect(); let ecdf = ECDF::new(samples); - let cdf = |x| { - statrs::distribution::Binomial::new(P, N) - .unwrap() - .cdf(x as u64) - }; - - let ks_statistic = kolmo_smirnov_statistic_discrete(ecdf, cdf); + let ks_statistic = kolmo_smirnov_statistic_discrete(ecdf, |x| cdf(x)); let critical_value = 1.36 / (N_SAMPLES as f64).sqrt(); @@ -130,3 +115,19 @@ fn binomial() { println!("Critical value: {}", critical_value); assert!(ks_statistic < critical_value); } + +#[test] +fn normal() { + test_continious(Normal::new(0.0, 1.0).unwrap(), |x| { + statrs::distribution::Normal::new(0.0, 1.0).unwrap().cdf(x) + }); +} + +#[test] +fn binomial() { + test_discrete(rand_distr::Binomial::new(10, 0.5).unwrap(), |x| { + statrs::distribution::Binomial::new(0.5, 10) + .unwrap() + .cdf(x as u64) + }); +} From e937ef63d6a907ae516763b2d8d4edab9dcb17cb Mon Sep 17 00:00:00 2001 From: Benjamin Lieser Date: Tue, 17 Sep 2024 18:10:24 +0200 Subject: [PATCH 03/13] fix internal dependency on rand_distr --- rand_distr/tests/cdf.rs | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/rand_distr/tests/cdf.rs b/rand_distr/tests/cdf.rs index e1a8b4bf94..63d0a405b7 100644 --- a/rand_distr/tests/cdf.rs +++ b/rand_distr/tests/cdf.rs @@ -12,11 +12,11 @@ use statrs::distribution::ContinuousCDF; use statrs::distribution::DiscreteCDF; /// Empirical Cumulative Distribution Function (ECDF) -struct ECDF { +struct Ecdf { sorted_samples: Vec, } -impl ECDF { +impl Ecdf { fn new(mut samples: Vec) -> Self { samples.sort_by(|a, b| a.partial_cmp(b).unwrap()); Self { @@ -45,7 +45,7 @@ impl ECDF { } } -fn kolmo_smirnov_statistic_continuous(ecdf: ECDF, cdf: impl Fn(f64) -> f64) -> f64 { +fn kolmo_smirnov_statistic_continuous(ecdf: Ecdf, cdf: impl Fn(f64) -> f64) -> f64 { let mut max_diff = 0.; // The maximum will always be at a step point, because the cdf is continious monotonic increasing let step_points = ecdf.step_points(); @@ -62,15 +62,14 @@ fn kolmo_smirnov_statistic_continuous(ecdf: ECDF, cdf: impl Fn(f64) -> f64) -> f max_diff } -fn kolmo_smirnov_statistic_discrete(ecdf: ECDF, cdf: impl Fn(i64) -> f64) -> f64 { +fn kolmo_smirnov_statistic_discrete(ecdf: Ecdf, cdf: impl Fn(i64) -> f64) -> f64 { let mut max_diff = 0.; // The maximum will always be at a step point, but we have to be careful because the cdf is not continious // It is actually easier because both are right continious step functions let step_points = ecdf.step_points(); - for i in 1..step_points.len() { - let x = step_points[i].0; - let diff = (step_points[i].1 - cdf(x as i64)).abs(); + for (x, y) in step_points[1..].iter() { + let diff = (*y - cdf(*x as i64)).abs(); if diff > max_diff { max_diff = diff; } @@ -83,9 +82,9 @@ fn test_continious(dist: impl Distribution, cdf: impl Fn(f64) -> f64) { const N_SAMPLES: u64 = 1_000_000; let mut rng = rand::rngs::SmallRng::seed_from_u64(1); let samples = (0..N_SAMPLES).map(|_| dist.sample(&mut rng)).collect(); - let ecdf = ECDF::new(samples); + let ecdf = Ecdf::new(samples); - let ks_statistic = kolmo_smirnov_statistic_continuous(ecdf, |x| cdf(x)); + let ks_statistic = kolmo_smirnov_statistic_continuous(ecdf, cdf); // It is a heuristic value, we want to prove that the distributions match, so p values don't help us let critical_value = 1.36 / (N_SAMPLES as f64).sqrt(); @@ -105,9 +104,9 @@ where let samples = (0..N_SAMPLES) .map(|_| dist.sample(&mut rng).try_into().unwrap() as f64) .collect(); - let ecdf = ECDF::new(samples); + let ecdf = Ecdf::new(samples); - let ks_statistic = kolmo_smirnov_statistic_discrete(ecdf, |x| cdf(x)); + let ks_statistic = kolmo_smirnov_statistic_discrete(ecdf, cdf); let critical_value = 1.36 / (N_SAMPLES as f64).sqrt(); From 886ca3493dd6526ab9a4386dfad56040ffa83cbb Mon Sep 17 00:00:00 2001 From: Benjamin Lieser Date: Thu, 19 Sep 2024 14:42:56 +0200 Subject: [PATCH 04/13] higher critical value, the old one would fail in 1 of 20 cases even if everything works correctly --- rand_distr/tests/cdf.rs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/rand_distr/tests/cdf.rs b/rand_distr/tests/cdf.rs index 63d0a405b7..bffa6eb638 100644 --- a/rand_distr/tests/cdf.rs +++ b/rand_distr/tests/cdf.rs @@ -80,14 +80,14 @@ fn kolmo_smirnov_statistic_discrete(ecdf: Ecdf, cdf: impl Fn(i64) -> f64) -> f64 #[cfg(test)] fn test_continious(dist: impl Distribution, cdf: impl Fn(f64) -> f64) { const N_SAMPLES: u64 = 1_000_000; - let mut rng = rand::rngs::SmallRng::seed_from_u64(1); + let mut rng = rand::rngs::SmallRng::seed_from_u64(6); let samples = (0..N_SAMPLES).map(|_| dist.sample(&mut rng)).collect(); let ecdf = Ecdf::new(samples); let ks_statistic = kolmo_smirnov_statistic_continuous(ecdf, cdf); - // It is a heuristic value, we want to prove that the distributions match, so p values don't help us - let critical_value = 1.36 / (N_SAMPLES as f64).sqrt(); + // If the sampler is correct, we expect less than 0.001 false positives (alpha = 0.001). Passing this does not prove that the sampler is correct but is a good indication. + let critical_value = 1.95 / (N_SAMPLES as f64).sqrt(); println!("KS statistic: {}", ks_statistic); println!("Critical value: {}", critical_value); @@ -100,7 +100,7 @@ where >::Error: std::fmt::Debug, { const N_SAMPLES: u64 = 1_000_000; - let mut rng = rand::rngs::SmallRng::seed_from_u64(1); + let mut rng = rand::rngs::SmallRng::seed_from_u64(2); let samples = (0..N_SAMPLES) .map(|_| dist.sample(&mut rng).try_into().unwrap() as f64) .collect(); @@ -108,7 +108,8 @@ where let ks_statistic = kolmo_smirnov_statistic_discrete(ecdf, cdf); - let critical_value = 1.36 / (N_SAMPLES as f64).sqrt(); + // If the sampler is correct, we expect less than 0.001 false positives (alpha = 0.001). Passing this does not prove that the sampler is correct but is a good indication. + let critical_value = 1.95 / (N_SAMPLES as f64).sqrt(); println!("KS statistic: {}", ks_statistic); println!("Critical value: {}", critical_value); From 954fd5db96178c0bb71b3296cd1e3083f435a8d3 Mon Sep 17 00:00:00 2001 From: Benjamin Lieser Date: Wed, 25 Sep 2024 09:24:31 +0200 Subject: [PATCH 05/13] test 20 seeds --- rand_distr/tests/cdf.rs | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/rand_distr/tests/cdf.rs b/rand_distr/tests/cdf.rs index bffa6eb638..d9b1127d1b 100644 --- a/rand_distr/tests/cdf.rs +++ b/rand_distr/tests/cdf.rs @@ -78,15 +78,16 @@ fn kolmo_smirnov_statistic_discrete(ecdf: Ecdf, cdf: impl Fn(i64) -> f64) -> f64 } #[cfg(test)] -fn test_continious(dist: impl Distribution, cdf: impl Fn(f64) -> f64) { +fn test_continious(seed: u64, dist: impl Distribution, cdf: impl Fn(f64) -> f64) { const N_SAMPLES: u64 = 1_000_000; - let mut rng = rand::rngs::SmallRng::seed_from_u64(6); + let mut rng = rand::rngs::SmallRng::seed_from_u64(seed); let samples = (0..N_SAMPLES).map(|_| dist.sample(&mut rng)).collect(); let ecdf = Ecdf::new(samples); let ks_statistic = kolmo_smirnov_statistic_continuous(ecdf, cdf); - // If the sampler is correct, we expect less than 0.001 false positives (alpha = 0.001). Passing this does not prove that the sampler is correct but is a good indication. + // If the sampler is correct, we expect less than 0.001 false positives (alpha = 0.001). + // Passing this does not prove that the sampler is correct but is a good indication. let critical_value = 1.95 / (N_SAMPLES as f64).sqrt(); println!("KS statistic: {}", ks_statistic); @@ -95,12 +96,12 @@ fn test_continious(dist: impl Distribution, cdf: impl Fn(f64) -> f64) { } #[cfg(test)] -fn test_discrete>(dist: impl Distribution, cdf: impl Fn(i64) -> f64) +fn test_discrete>(seed: u64, dist: impl Distribution, cdf: impl Fn(i64) -> f64) where >::Error: std::fmt::Debug, { const N_SAMPLES: u64 = 1_000_000; - let mut rng = rand::rngs::SmallRng::seed_from_u64(2); + let mut rng = rand::rngs::SmallRng::seed_from_u64(seed); let samples = (0..N_SAMPLES) .map(|_| dist.sample(&mut rng).try_into().unwrap() as f64) .collect(); @@ -118,16 +119,20 @@ where #[test] fn normal() { - test_continious(Normal::new(0.0, 1.0).unwrap(), |x| { - statrs::distribution::Normal::new(0.0, 1.0).unwrap().cdf(x) - }); + for seed in 1..20 { + test_continious(seed, Normal::new(0.0, 1.0).unwrap(), |x| { + statrs::distribution::Normal::new(0.0, 1.0).unwrap().cdf(x) + }); + } } #[test] fn binomial() { - test_discrete(rand_distr::Binomial::new(10, 0.5).unwrap(), |x| { - statrs::distribution::Binomial::new(0.5, 10) - .unwrap() - .cdf(x as u64) - }); + for seed in 1..20 { + test_discrete(seed, rand_distr::Binomial::new(10, 0.5).unwrap(), |x| { + statrs::distribution::Binomial::new(0.5, 10) + .unwrap() + .cdf(x as u64) + }); + } } From 2367b8a5a31220901ea64afae98a3fdaa2a6c5fe Mon Sep 17 00:00:00 2001 From: Benjamin Lieser Date: Mon, 30 Sep 2024 16:17:55 +0200 Subject: [PATCH 06/13] fixed both KS statistic functions --- rand_distr/tests/cdf.rs | 62 ++++++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/rand_distr/tests/cdf.rs b/rand_distr/tests/cdf.rs index d9b1127d1b..bb96803eb7 100644 --- a/rand_distr/tests/cdf.rs +++ b/rand_distr/tests/cdf.rs @@ -11,6 +11,10 @@ use rand_distr::{Distribution, Normal}; use statrs::distribution::ContinuousCDF; use statrs::distribution::DiscreteCDF; +// [1] Nonparametric Goodness-of-Fit Tests for Discrete Null Distributions +// by Taylor B. Arnold and John W. Emerson +// http://www.stat.yale.edu/~jay/EmersonMaterials/DiscreteGOF.pdf + /// Empirical Cumulative Distribution Function (ECDF) struct Ecdf { sorted_samples: Vec, @@ -26,7 +30,7 @@ impl Ecdf { /// Returns the step points of the ECDF /// The ECDF is a step function that increases by 1/n at each sample point - /// The function is continoius from the right, so we give the bigger value at the step points + /// The function is continuous from the right, so we give the bigger value at the step points /// First point is (-inf, 0.0), last point is (max(samples), 1.0) fn step_points(&self) -> Vec<(f64, f64)> { let mut points = Vec::with_capacity(self.sorted_samples.len() + 1); @@ -45,46 +49,48 @@ impl Ecdf { } } -fn kolmo_smirnov_statistic_continuous(ecdf: Ecdf, cdf: impl Fn(f64) -> f64) -> f64 { - let mut max_diff = 0.; - // The maximum will always be at a step point, because the cdf is continious monotonic increasing - let step_points = ecdf.step_points(); - for i in 0..step_points.len() - 1 { - // This shift is because we want the value of the ecdf at (x_{i + 1} - epsilon) = ecdf(x_i) and comare it to cdf(x_{i + 1}) - // cdf(x_{i + 1} - epsilon) = cdf(x_{i+1}) because cdf is continious - let x = step_points[i + 1].0; - let diff = (step_points[i].1 - cdf(x)).abs(); - - if diff > max_diff { - max_diff = diff; - } +fn kolmogorov_smirnov_statistic_continuous(ecdf: Ecdf, cdf: impl Fn(f64) -> f64) -> f64 { + // We implement equation (3) from [1] + + let mut max_diff: f64 = 0.; + + let step_points = ecdf.step_points(); // x_i in the paper + for i in 1..step_points.len() { + let (x_i, f_i) = step_points[i]; + let (_, f_i_1) = step_points[i - 1]; + let max_1 = (cdf(x_i) - f_i).abs(); + let max_2 = (cdf(x_i) - f_i_1).abs(); + + max_diff = max_diff.max(max_1).max(max_2); } max_diff } -fn kolmo_smirnov_statistic_discrete(ecdf: Ecdf, cdf: impl Fn(i64) -> f64) -> f64 { - let mut max_diff = 0.; +fn kolmogorov_smirnov_statistic_discrete(ecdf: Ecdf, cdf: impl Fn(i64) -> f64) -> f64 { + // We implement equation (4) from [1] + + let mut max_diff: f64 = 0.; - // The maximum will always be at a step point, but we have to be careful because the cdf is not continious - // It is actually easier because both are right continious step functions - let step_points = ecdf.step_points(); - for (x, y) in step_points[1..].iter() { - let diff = (*y - cdf(*x as i64)).abs(); - if diff > max_diff { - max_diff = diff; - } + let step_points = ecdf.step_points(); // x_i in the paper + for i in 1..step_points.len() { + let (x_i, f_i) = step_points[i]; + let (_, f_i_1) = step_points[i - 1]; + let max_1 = (cdf(x_i as i64) - f_i).abs(); + let max_2 = (cdf(x_i as i64 - 1) - f_i_1).abs(); // -1 is the same as -epsilon, because we have integer support + + max_diff = max_diff.max(max_1).max(max_2); } max_diff } #[cfg(test)] -fn test_continious(seed: u64, dist: impl Distribution, cdf: impl Fn(f64) -> f64) { +fn test_continuous(seed: u64, dist: impl Distribution, cdf: impl Fn(f64) -> f64) { const N_SAMPLES: u64 = 1_000_000; let mut rng = rand::rngs::SmallRng::seed_from_u64(seed); let samples = (0..N_SAMPLES).map(|_| dist.sample(&mut rng)).collect(); let ecdf = Ecdf::new(samples); - let ks_statistic = kolmo_smirnov_statistic_continuous(ecdf, cdf); + let ks_statistic = kolmogorov_smirnov_statistic_continuous(ecdf, cdf); // If the sampler is correct, we expect less than 0.001 false positives (alpha = 0.001). // Passing this does not prove that the sampler is correct but is a good indication. @@ -107,7 +113,7 @@ where .collect(); let ecdf = Ecdf::new(samples); - let ks_statistic = kolmo_smirnov_statistic_discrete(ecdf, cdf); + let ks_statistic = kolmogorov_smirnov_statistic_discrete(ecdf, cdf); // If the sampler is correct, we expect less than 0.001 false positives (alpha = 0.001). Passing this does not prove that the sampler is correct but is a good indication. let critical_value = 1.95 / (N_SAMPLES as f64).sqrt(); @@ -120,7 +126,7 @@ where #[test] fn normal() { for seed in 1..20 { - test_continious(seed, Normal::new(0.0, 1.0).unwrap(), |x| { + test_continuous(seed, Normal::new(0.0, 1.0).unwrap(), |x| { statrs::distribution::Normal::new(0.0, 1.0).unwrap().cdf(x) }); } From da35ab2be28dc3dc3aefa6243ac8066a04e0145f Mon Sep 17 00:00:00 2001 From: Benjamin Lieser Date: Mon, 30 Sep 2024 16:28:05 +0200 Subject: [PATCH 07/13] fix binomial cdf, to give 0 for negative numbers --- rand_distr/tests/cdf.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/rand_distr/tests/cdf.rs b/rand_distr/tests/cdf.rs index bb96803eb7..af31a53f60 100644 --- a/rand_distr/tests/cdf.rs +++ b/rand_distr/tests/cdf.rs @@ -68,7 +68,7 @@ fn kolmogorov_smirnov_statistic_continuous(ecdf: Ecdf, cdf: impl Fn(f64) -> f64) fn kolmogorov_smirnov_statistic_discrete(ecdf: Ecdf, cdf: impl Fn(i64) -> f64) -> f64 { // We implement equation (4) from [1] - + let mut max_diff: f64 = 0.; let step_points = ecdf.step_points(); // x_i in the paper @@ -136,9 +136,13 @@ fn normal() { fn binomial() { for seed in 1..20 { test_discrete(seed, rand_distr::Binomial::new(10, 0.5).unwrap(), |x| { - statrs::distribution::Binomial::new(0.5, 10) - .unwrap() - .cdf(x as u64) + if x < 0 { + 0.0 + } else { + statrs::distribution::Binomial::new(0.5, 10) + .unwrap() + .cdf(x as u64) + } }); } } From 72955e30c4776bf17635c800fbd53ec51d2877d9 Mon Sep 17 00:00:00 2001 From: Benjamin Lieser Date: Mon, 30 Sep 2024 17:04:46 +0200 Subject: [PATCH 08/13] diverse parameters, removed some duplicated code --- rand_distr/tests/cdf.rs | 83 +++++++++++++++++++++++++++-------------- 1 file changed, 55 insertions(+), 28 deletions(-) diff --git a/rand_distr/tests/cdf.rs b/rand_distr/tests/cdf.rs index af31a53f60..153b42cf18 100644 --- a/rand_distr/tests/cdf.rs +++ b/rand_distr/tests/cdf.rs @@ -6,6 +6,7 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. +use num_traits::AsPrimitive; use rand::SeedableRng; use rand_distr::{Distribution, Normal}; use statrs::distribution::ContinuousCDF; @@ -83,40 +84,46 @@ fn kolmogorov_smirnov_statistic_discrete(ecdf: Ecdf, cdf: impl Fn(i64) -> f64) - max_diff } -#[cfg(test)] -fn test_continuous(seed: u64, dist: impl Distribution, cdf: impl Fn(f64) -> f64) { - const N_SAMPLES: u64 = 1_000_000; - let mut rng = rand::rngs::SmallRng::seed_from_u64(seed); - let samples = (0..N_SAMPLES).map(|_| dist.sample(&mut rng)).collect(); - let ecdf = Ecdf::new(samples); - - let ks_statistic = kolmogorov_smirnov_statistic_continuous(ecdf, cdf); +const SAMPLE_SIZE: u64 = 1_000_000; +fn critical_value() -> f64 { // If the sampler is correct, we expect less than 0.001 false positives (alpha = 0.001). // Passing this does not prove that the sampler is correct but is a good indication. - let critical_value = 1.95 / (N_SAMPLES as f64).sqrt(); - - println!("KS statistic: {}", ks_statistic); - println!("Critical value: {}", critical_value); - assert!(ks_statistic < critical_value); + 1.95 / (SAMPLE_SIZE as f64).sqrt() } -#[cfg(test)] -fn test_discrete>(seed: u64, dist: impl Distribution, cdf: impl Fn(i64) -> f64) +fn sample_ecdf(seed: u64, dist: impl Distribution) -> Ecdf where - >::Error: std::fmt::Debug, + T: AsPrimitive, { - const N_SAMPLES: u64 = 1_000_000; let mut rng = rand::rngs::SmallRng::seed_from_u64(seed); - let samples = (0..N_SAMPLES) - .map(|_| dist.sample(&mut rng).try_into().unwrap() as f64) + let samples = (0..SAMPLE_SIZE) + .map(|_| dist.sample(&mut rng).as_()) .collect(); - let ecdf = Ecdf::new(samples); + Ecdf::new(samples) +} + +fn test_continuous(seed: u64, dist: impl Distribution, cdf: impl Fn(f64) -> f64) { + let ecdf = sample_ecdf(seed, dist); + let ks_statistic = kolmogorov_smirnov_statistic_continuous(ecdf, cdf); + + let critical_value = critical_value(); + + println!("KS statistic: {}", ks_statistic); + println!("Critical value: {}", critical_value); + assert!(ks_statistic < critical_value); +} +fn test_discrete>( + seed: u64, + dist: impl Distribution, + cdf: impl Fn(i64) -> f64, +) { + let ecdf = sample_ecdf(seed, dist); let ks_statistic = kolmogorov_smirnov_statistic_discrete(ecdf, cdf); - // If the sampler is correct, we expect less than 0.001 false positives (alpha = 0.001). Passing this does not prove that the sampler is correct but is a good indication. - let critical_value = 1.95 / (N_SAMPLES as f64).sqrt(); + // This critical value is bigger than it could be for discrete distributions, but because of large sample sizes this should not matter too much + let critical_value = critical_value(); println!("KS statistic: {}", ks_statistic); println!("Critical value: {}", critical_value); @@ -125,21 +132,41 @@ where #[test] fn normal() { - for seed in 1..20 { - test_continuous(seed, Normal::new(0.0, 1.0).unwrap(), |x| { - statrs::distribution::Normal::new(0.0, 1.0).unwrap().cdf(x) + let parameters = [ + (0.0, 1.0), + (0.0, 0.1), + (1.0, 10.0), + (1.0, 100.0), + (-1.0, 0.00001), + (-1.0, 0.0000001), + ]; + + for (seed, (mean, std_dev)) in parameters.into_iter().enumerate() { + test_continuous(seed as u64, Normal::new(mean, std_dev).unwrap(), |x| { + statrs::distribution::Normal::new(mean, std_dev) + .unwrap() + .cdf(x) }); } } #[test] fn binomial() { - for seed in 1..20 { - test_discrete(seed, rand_distr::Binomial::new(10, 0.5).unwrap(), |x| { + let parameters = [ + (0.5, 10), + (0.5, 100), + (0.1, 10), + (0.0000001, 1000000), + (0.0000001, 10), + (0.9999, 2), + ]; + + for (seed, (p, n)) in parameters.into_iter().enumerate() { + test_discrete(seed as u64, rand_distr::Binomial::new(n, p).unwrap(), |x| { if x < 0 { 0.0 } else { - statrs::distribution::Binomial::new(0.5, 10) + statrs::distribution::Binomial::new(p, n) .unwrap() .cdf(x as u64) } From ea38d1409c2e7c6009ddeec1cab8a85a9a634a3e Mon Sep 17 00:00:00 2001 From: Benjamin Lieser Date: Thu, 3 Oct 2024 16:19:36 +0200 Subject: [PATCH 09/13] own impl of cdf not depending on statrs --- rand_distr/CHANGELOG.md | 1 + rand_distr/tests/cdf.rs | 44 +++++++++++++++++++++++++++++------------ 2 files changed, 32 insertions(+), 13 deletions(-) diff --git a/rand_distr/CHANGELOG.md b/rand_distr/CHANGELOG.md index 51bde39e86..fd13e76a4d 100644 --- a/rand_distr/CHANGELOG.md +++ b/rand_distr/CHANGELOG.md @@ -6,6 +6,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased - The `serde1` feature has been renamed `serde` (#1477) +- Add Kolmogorov Smirnov test for sampling of `Normal` and `Binomial` (#1494) ### Added - Add plots for `rand_distr` distributions to documentation (#1434) diff --git a/rand_distr/tests/cdf.rs b/rand_distr/tests/cdf.rs index 153b42cf18..3013693171 100644 --- a/rand_distr/tests/cdf.rs +++ b/rand_distr/tests/cdf.rs @@ -6,11 +6,13 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. +use core::f64; + use num_traits::AsPrimitive; use rand::SeedableRng; use rand_distr::{Distribution, Normal}; -use statrs::distribution::ContinuousCDF; -use statrs::distribution::DiscreteCDF; +use special::Beta; +use special::Primitive; // [1] Nonparametric Goodness-of-Fit Tests for Discrete Null Distributions // by Taylor B. Arnold and John W. Emerson @@ -130,6 +132,10 @@ fn test_discrete>( assert!(ks_statistic < critical_value); } +fn normal_cdf(x: f64, mean: f64, std_dev: f64) -> f64 { + 0.5 * (1.0 + ((x - mean) / (std_dev * f64::consts::SQRT_2)).erf()) +} + #[test] fn normal() { let parameters = [ @@ -143,13 +149,31 @@ fn normal() { for (seed, (mean, std_dev)) in parameters.into_iter().enumerate() { test_continuous(seed as u64, Normal::new(mean, std_dev).unwrap(), |x| { - statrs::distribution::Normal::new(mean, std_dev) - .unwrap() - .cdf(x) + normal_cdf(x, mean, std_dev) }); } } +fn binomial_cdf(k: i64, p: f64, n: u64) -> f64 { + if k < 0 { + return 0.0; + } + if k >= n as i64 { + return 1.0; + } + + let a = n as f64 - k as f64; + let b = k as f64 + 1.0; + + let q = 1.0 - p; + + let ln_beta_ab = a.ln_beta(b); + + let reg_incomplete_beta = q.inc_beta(a, b, ln_beta_ab); + + return reg_incomplete_beta; +} + #[test] fn binomial() { let parameters = [ @@ -162,14 +186,8 @@ fn binomial() { ]; for (seed, (p, n)) in parameters.into_iter().enumerate() { - test_discrete(seed as u64, rand_distr::Binomial::new(n, p).unwrap(), |x| { - if x < 0 { - 0.0 - } else { - statrs::distribution::Binomial::new(p, n) - .unwrap() - .cdf(x as u64) - } + test_discrete(seed as u64, rand_distr::Binomial::new(n, p).unwrap(), |k| { + binomial_cdf(k, p, n) }); } } From 17f3565d4c0e4c4eb5aebf0fe2a6f0b8739d30e2 Mon Sep 17 00:00:00 2001 From: Benjamin Lieser Date: Thu, 3 Oct 2024 16:22:52 +0200 Subject: [PATCH 10/13] fmt + clippy --- rand_distr/tests/cdf.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/rand_distr/tests/cdf.rs b/rand_distr/tests/cdf.rs index 3013693171..2624b2f3e3 100644 --- a/rand_distr/tests/cdf.rs +++ b/rand_distr/tests/cdf.rs @@ -168,11 +168,9 @@ fn binomial_cdf(k: i64, p: f64, n: u64) -> f64 { let q = 1.0 - p; let ln_beta_ab = a.ln_beta(b); - - let reg_incomplete_beta = q.inc_beta(a, b, ln_beta_ab); - return reg_incomplete_beta; -} + q.inc_beta(a, b, ln_beta_ab) +} #[test] fn binomial() { From 86e8d81ab569f8a896344e5dac9fd16ed2e30fbb Mon Sep 17 00:00:00 2001 From: Benjamin Lieser Date: Mon, 7 Oct 2024 15:33:04 +0200 Subject: [PATCH 11/13] review --- rand_distr/tests/cdf.rs | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/rand_distr/tests/cdf.rs b/rand_distr/tests/cdf.rs index 2624b2f3e3..438b71ec93 100644 --- a/rand_distr/tests/cdf.rs +++ b/rand_distr/tests/cdf.rs @@ -61,8 +61,9 @@ fn kolmogorov_smirnov_statistic_continuous(ecdf: Ecdf, cdf: impl Fn(f64) -> f64) for i in 1..step_points.len() { let (x_i, f_i) = step_points[i]; let (_, f_i_1) = step_points[i - 1]; - let max_1 = (cdf(x_i) - f_i).abs(); - let max_2 = (cdf(x_i) - f_i_1).abs(); + let cdf_i = cdf(x_i); + let max_1 = (cdf_i - f_i).abs(); + let max_2 = (cdf_i - f_i_1).abs(); max_diff = max_diff.max(max_1).max(max_2); } @@ -105,7 +106,9 @@ where Ecdf::new(samples) } -fn test_continuous(seed: u64, dist: impl Distribution, cdf: impl Fn(f64) -> f64) { +/// Tests a distribution against an analytical CDF. +/// The cdf has to be continuous. +pub fn test_continuous(seed: u64, dist: impl Distribution, cdf: impl Fn(f64) -> f64) { let ecdf = sample_ecdf(seed, dist); let ks_statistic = kolmogorov_smirnov_statistic_continuous(ecdf, cdf); @@ -116,7 +119,9 @@ fn test_continuous(seed: u64, dist: impl Distribution, cdf: impl Fn(f64) -> assert!(ks_statistic < critical_value); } -fn test_discrete>( +/// Tests a distribution over integers against an analytical CDF. +/// The analytical CDF must not have jump points which are not integers. +pub fn test_discrete>( seed: u64, dist: impl Distribution, cdf: impl Fn(i64) -> f64, From 32be84f356aa184a305c1c1209f8b442088b379b Mon Sep 17 00:00:00 2001 From: Benjamin Lieser Date: Mon, 7 Oct 2024 15:34:17 +0200 Subject: [PATCH 12/13] capitalize CDF consistency in doc --- rand_distr/tests/cdf.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rand_distr/tests/cdf.rs b/rand_distr/tests/cdf.rs index 438b71ec93..fc74636c1e 100644 --- a/rand_distr/tests/cdf.rs +++ b/rand_distr/tests/cdf.rs @@ -107,7 +107,7 @@ where } /// Tests a distribution against an analytical CDF. -/// The cdf has to be continuous. +/// The CDF has to be continuous. pub fn test_continuous(seed: u64, dist: impl Distribution, cdf: impl Fn(f64) -> f64) { let ecdf = sample_ecdf(seed, dist); let ks_statistic = kolmogorov_smirnov_statistic_continuous(ecdf, cdf); From 62dd2be19977202bb414174b16c1119323d76963 Mon Sep 17 00:00:00 2001 From: Benjamin Lieser Date: Tue, 8 Oct 2024 10:04:21 +0200 Subject: [PATCH 13/13] review --- rand_distr/tests/cdf.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/rand_distr/tests/cdf.rs b/rand_distr/tests/cdf.rs index fc74636c1e..71b808d241 100644 --- a/rand_distr/tests/cdf.rs +++ b/rand_distr/tests/cdf.rs @@ -138,7 +138,7 @@ pub fn test_discrete>( } fn normal_cdf(x: f64, mean: f64, std_dev: f64) -> f64 { - 0.5 * (1.0 + ((x - mean) / (std_dev * f64::consts::SQRT_2)).erf()) + 0.5 * ((mean - x) / (std_dev * f64::consts::SQRT_2)).erfc() } #[test] @@ -163,11 +163,12 @@ fn binomial_cdf(k: i64, p: f64, n: u64) -> f64 { if k < 0 { return 0.0; } - if k >= n as i64 { + let k = k as u64; + if k >= n { return 1.0; } - let a = n as f64 - k as f64; + let a = (n - k) as f64; let b = k as f64 + 1.0; let q = 1.0 - p;