From de24849da6aefb31bfef1800b693f9a5265c641c Mon Sep 17 00:00:00 2001 From: gord chung <5091603+chungg@users.noreply.github.com> Date: Tue, 30 Jul 2024 14:54:31 -0400 Subject: [PATCH] Hoeffding's D --- benches/traquer.rs | 14 ++++ src/correlation.rs | 72 ++++++++++++++++++- src/statistic/distribution.rs | 129 ++++++++++++++++++++++++++++------ tests/correlation_test.rs | 48 +++++++++++-- tests/stat_dist_test.rs | 29 ++++++++ 5 files changed, 264 insertions(+), 28 deletions(-) diff --git a/benches/traquer.rs b/benches/traquer.rs index 6fb53d8..eb1f879 100644 --- a/benches/traquer.rs +++ b/benches/traquer.rs @@ -603,6 +603,9 @@ fn criterion_benchmark(c: &mut Criterion) { c.bench_function("correlation-krcc", |b| { b.iter(|| black_box(correlation::krcc(&stats.close, &stats.close, 16).collect::>())) }); + c.bench_function("correlation-hoeffd", |b| { + b.iter(|| black_box(correlation::hoeffd(&stats.close, &stats.close, 16).collect::>())) + }); c.bench_function("stats-dist-variance", |b| { b.iter(|| { @@ -669,6 +672,17 @@ fn criterion_benchmark(c: &mut Criterion) { black_box(statistic::regression::mda(&stats.close, &stats.open).collect::>()) }) }); + c.bench_function("blah", |b| { + b.iter(|| { + black_box( + statistic::distribution::rank( + &stats.close, + Some(statistic::distribution::RankMode::Max), + ) + .collect::>(), + ) + }) + }); } criterion_group!(benches, criterion_benchmark); diff --git a/src/correlation.rs b/src/correlation.rs index 686961a..8e31ae6 100644 --- a/src/correlation.rs +++ b/src/correlation.rs @@ -8,7 +8,7 @@ use std::iter; use itertools::izip; use crate::smooth; -use crate::statistic::distribution::{cov_stdev, rank}; +use crate::statistic::distribution::{cov_stdev, rank, RankMode}; /// Pearson Correlation Coefficient /// @@ -270,8 +270,8 @@ pub fn srcc<'a>( .zip(series2.windows(window)) .map(|(x_win, y_win)| { let (cov_xy, std_x, std_y) = cov_stdev( - &rank(x_win).collect::>(), - &rank(y_win).collect::>(), + &rank(x_win, None).collect::>(), + &rank(y_win, None).collect::>(), ); cov_xy / (std_x * std_y) }), @@ -335,3 +335,69 @@ pub fn krcc<'a>( }), ) } + +/// Hoeffding's D +/// +/// A test based on the population measure of deviation from independence. More +/// resource-intensive compared to other correlation functions but may handle non-monotonic +/// relationships better. +/// +/// ## Usage +/// +/// Generates a measure that ranges from -0.5 to 1, where the higher the number is, the +/// more strongly dependent the two sequences are on each other. +/// +/// ## Sources +/// +/// [[1]](https://github.com/Dicklesworthstone/hoeffdings_d_explainer) +/// +/// # Examples +/// +/// ``` +/// use traquer::correlation; +/// +/// correlation::hoeffd( +/// &vec![1.0,2.0,3.0,4.0,5.0,6.0,4.0,5.0], +/// &vec![1.0,2.0,3.0,4.0,5.0,6.0,4.0,5.0], +/// 6).collect::>(); +/// +/// ``` + +pub fn hoeffd<'a>( + series1: &'a [f64], + series2: &'a [f64], + window: usize, +) -> impl Iterator + 'a { + iter::repeat(f64::NAN).take(window - 1).chain( + series1 + .windows(window) + .zip(series2.windows(window)) + .map(move |(x_win, y_win)| { + let rank_x = rank(x_win, Some(RankMode::Average)).collect::>(); + let rank_y = rank(y_win, Some(RankMode::Average)).collect::>(); + let mut q = vec![1.0; window]; + for i in 0..window { + for j in 0..window { + q[i] += (rank_x[j] < rank_x[i] && rank_y[j] < rank_y[i]) as u8 as f64; + q[i] += + 0.25 * (rank_x[j] == rank_x[i] && rank_y[j] == rank_y[i]) as u8 as f64; + q[i] += 0.5 + * ((rank_x[j] == rank_x[i] && rank_y[j] < rank_y[i]) + || (rank_x[j] < rank_x[i] && rank_y[j] == rank_y[i])) + as u8 as f64; + } + q[i] -= 0.25; // accounts for when comparing to itself + } + let d1 = q.iter().fold(0.0, |acc, x| acc + (x - 1.0) * (x - 2.0)); + let d2 = rank_x.iter().zip(&rank_y).fold(0.0, |acc, (x, y)| { + acc + (x - 1.0) * (x - 2.0) * (y - 1.0) * (y - 2.0) + }); + let d3 = izip!(q, rank_x, rank_y).fold(0.0, |acc, (q, x, y)| { + acc + (x - 2.0) * (y - 2.0) * (q - 1.0) + }); + 30.0 * (((window - 2) * (window - 3)) as f64 * d1 + d2 + - 2. * (window - 2) as f64 * d3) + / (window * (window - 1) * (window - 2) * (window - 3) * (window - 4)) as f64 + }), + ) +} diff --git a/src/statistic/distribution.rs b/src/statistic/distribution.rs index 732d529..9459789 100644 --- a/src/statistic/distribution.rs +++ b/src/statistic/distribution.rs @@ -311,32 +311,119 @@ pub fn quantile(data: &[f64], window: usize, q: f64) -> impl Iterator(x: &'a [usize], y: &'a [usize]) -> (f64, f64, f64) { - let x_avg = x.iter().fold(0.0, |acc, &x| acc + x as f64) / x.len() as f64; - let y_avg = y.iter().fold(0.0, |acc, &y| acc + y as f64) / y.len() as f64; +pub(crate) fn cov_stdev<'a>(x: &'a [f64], y: &'a [f64]) -> (f64, f64, f64) { + let x_avg = x.iter().fold(0.0, |acc, &x| acc + x) / x.len() as f64; + let y_avg = y.iter().fold(0.0, |acc, &y| acc + y) / y.len() as f64; ( - x.iter().zip(y).fold(0.0, |acc, (&xi, &yi)| { - acc + (xi as f64 - x_avg) * (yi as f64 - y_avg) - }) / (x.len() - 1) as f64, - (x.iter() - .fold(0.0, |acc, &xi| acc + (xi as f64 - x_avg).powi(2)) - / (x.len() - 1) as f64) - .sqrt(), - (y.iter() - .fold(0.0, |acc, &yi| acc + (yi as f64 - y_avg).powi(2)) - / (y.len() - 1) as f64) - .sqrt(), + x.iter() + .zip(y) + .fold(0.0, |acc, (&xi, &yi)| acc + (xi - x_avg) * (yi - y_avg)) + / (x.len() - 1) as f64, + (x.iter().fold(0.0, |acc, &xi| acc + (xi - x_avg).powi(2)) / (x.len() - 1) as f64).sqrt(), + (y.iter().fold(0.0, |acc, &yi| acc + (yi - y_avg).powi(2)) / (y.len() - 1) as f64).sqrt(), ) } -pub(crate) fn rank(data: &[f64]) -> impl Iterator + '_ { +/// Rank modes types +pub enum RankMode { + Average, + Dense, + Max, + Min, + Ordinal, +} + +trait SortExt { + // probably move this somewhere in future + fn argsort(&self) -> Vec; +} + +impl SortExt for Vec { + fn argsort(&self) -> Vec { + let mut indices = (0..self.len()).collect::>(); + indices.sort_by(|&a, &b| self[a].partial_cmp(&self[b]).unwrap()); + indices + } +} + +/// Rank +/// +/// Assign ranks to data, dealing with ties appropriately. Ranks begin at 1. +/// +/// ## Sources +/// +/// [[1]](https://en.wikipedia.org/wiki/Ranking_(statistics)) +/// +/// # Examples +/// +/// ``` +/// use traquer::statistic::distribution; +/// +/// distribution::rank( +/// &[1.0,2.0,3.0,4.0,5.0], +/// Some(distribution::RankMode::Ordinal) +/// ).collect::>(); +/// ``` +pub fn rank(data: &[f64], mode: Option) -> Box + '_> { let mut values = data.to_vec(); - values.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap()); - let mut rank_map: HashMap = HashMap::new(); - for (rank, element) in values.iter().enumerate() { - rank_map.entry(element.to_bits()).or_insert(rank); + match mode { + Some(RankMode::Ordinal) => Box::new( + values + .argsort() + .argsort() + .iter() + .map(|&x| (x + 1) as f64) + .collect::>() + .into_iter(), + ), + _ => { + values.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap()); + let mut rank_map: HashMap = HashMap::new(); + + match mode { + Some(RankMode::Average) => { + let mut start_idx = 0; + for (idx, &element) in values.iter().enumerate().skip(1) { + if element != values[start_idx] { + rank_map.insert( + values[start_idx].to_bits(), + (start_idx..idx).sum::() as f64 / (idx - start_idx) as f64, + ); + start_idx = idx; + } + } + rank_map.insert( + values[start_idx].to_bits(), + (start_idx..values.len()).sum::() as f64 + / (values.len() - start_idx) as f64, + ); + } + Some(RankMode::Max) => { + let size = values.len() - 1; + values.iter().rev().enumerate().for_each(|(rank, element)| { + rank_map + .entry(element.to_bits()) + .or_insert((size - rank) as f64); + }); + } + Some(RankMode::Dense) => { + let mut rank = 0.0; + values.iter().for_each(|element| { + if let std::collections::hash_map::Entry::Vacant(e) = rank_map.entry(element.to_bits()) { + e.insert(rank); + rank += 1.0; + } + }); + } + _ => values.iter().enumerate().for_each(|(rank, element)| { + rank_map.entry(element.to_bits()).or_insert(rank as f64); + }), + }; + Box::new( + data.iter() + .map(move |x| rank_map.get(&x.to_bits()).unwrap().to_owned() + 1.), + ) + } } - data.iter() - .map(move |x| rank_map.get(&x.to_bits()).unwrap().to_owned()) } diff --git a/tests/correlation_test.rs b/tests/correlation_test.rs index d5d83be..85a5da6 100644 --- a/tests/correlation_test.rs +++ b/tests/correlation_test.rs @@ -190,17 +190,57 @@ fn test_krcc() { result[16 - 1..] ); let result = correlation::krcc( - &vec![7.1, 7.1, 7.2, 8.3, 9.4, 10.5, 11.4], - &vec![2.8, 2.9, 2.8, 2.6, 3.5, 4.6, 5.0], + &[7.1, 7.1, 7.2, 8.3, 9.4, 10.5, 11.4], + &[2.8, 2.9, 2.8, 2.6, 3.5, 4.6, 5.0], 7, ) .collect::>(); assert_eq!(0.55, result[7 - 1]); let result = correlation::krcc( - &vec![7.1, 7.1, 7.2, 8.3, 9.4, 10.5, 11.4], - &vec![2.8, 2.8, 2.8, 2.6, 3.5, 4.6, 5.0], + &[7.1, 7.1, 7.2, 8.3, 9.4, 10.5, 11.4], + &[2.8, 2.8, 2.8, 2.6, 3.5, 4.6, 5.0], 7, ) .collect::>(); assert_eq!(0.6324555320336759, result[7 - 1]); } + +#[test] +fn test_hoeffd() { + let result = correlation::hoeffd( + &[55., 62., 68., 70., 72., 65., 67., 78., 78., 78.], + &[125., 145., 160., 156., 190., 150., 165., 250., 250., 250.], + 10, + ) + .collect::>(); + assert_eq!(0.4107142857142857, result[9]); + + let stats = common::test_data(); + let stats2 = common::test_data_path("./tests/sp500.input"); + let result = correlation::hoeffd(&stats.close, &stats2.close, 16).collect::>(); + assert_eq!(stats.close.len(), result.len()); + assert_eq!( + vec![ + 0.12248168498168498, + 0.24336080586080586, + 0.2976190476190476, + 0.3857600732600733, + 0.38553113553113555, + 0.35393772893772896, + 0.3518772893772894, + 0.3257783882783883, + 0.2564102564102564, + 0.23992673992673993, + 0.19482600732600733, + 0.08333333333333333, + 0.0011446886446886447, + 0.02197802197802198, + 0.03205128205128205, + 0.14858058608058608, + 0.25984432234432236, + 0.3731684981684982, + 0.4251373626373626 + ], + result[16 - 1..] + ); +} diff --git a/tests/stat_dist_test.rs b/tests/stat_dist_test.rs index a85aeed..fefa1bb 100644 --- a/tests/stat_dist_test.rs +++ b/tests/stat_dist_test.rs @@ -359,3 +359,32 @@ fn test_median_odd() { result[15 - 1..] ); } + +#[test] +fn test_rank() { + let input = &[0., 2., 3., 2., 0., 2.]; + assert_eq!( + rank(input, Some(RankMode::Min)).collect::>(), + rank(input, None).collect::>() + ); + assert_eq!( + vec![1., 3., 6., 3., 1., 3.], + rank(input, None).collect::>() + ); + assert_eq!( + vec![2., 5., 6., 5., 2., 5.], + rank(input, Some(RankMode::Max)).collect::>() + ); + assert_eq!( + vec![1., 3., 6., 4., 2., 5.], + rank(input, Some(RankMode::Ordinal)).collect::>() + ); + assert_eq!( + vec![1.5, 4.0, 6.0, 4.0, 1.5, 4.0], + rank(input, Some(RankMode::Average)).collect::>() + ); + assert_eq!( + vec![1., 2., 3., 2., 1., 2.], + rank(input, Some(RankMode::Dense)).collect::>() + ); +}