From de24849da6aefb31bfef1800b693f9a5265c641c Mon Sep 17 00:00:00 2001
From: gord chung <5091603+chungg@users.noreply.github.com>
Date: Tue, 30 Jul 2024 14:54:31 -0400
Subject: [PATCH] Hoeffding's D

---
 benches/traquer.rs            |  14 ++++
 src/correlation.rs            |  72 ++++++++++++++++++-
 src/statistic/distribution.rs | 129 ++++++++++++++++++++++++++++------
 tests/correlation_test.rs     |  48 +++++++++++--
 tests/stat_dist_test.rs       |  29 ++++++++
 5 files changed, 264 insertions(+), 28 deletions(-)
diff --git a/benches/traquer.rs b/benches/traquer.rs
index 6fb53d8..eb1f879 100644
--- a/benches/traquer.rs
+++ b/benches/traquer.rs
@@ -603,6 +603,9 @@ fn criterion_benchmark(c: &mut Criterion) {
     c.bench_function("correlation-krcc", |b| {
         b.iter(|| black_box(correlation::krcc(&stats.close, &stats.close, 16).collect::<Vec<_>>()))
     });
+    c.bench_function("correlation-hoeffd", |b| {
+        b.iter(|| black_box(correlation::hoeffd(&stats.close, &stats.close, 16).collect::<Vec<_>>()))
+    });
 
     c.bench_function("stats-dist-variance", |b| {
         b.iter(|| {
@@ -669,6 +672,17 @@ fn criterion_benchmark(c: &mut Criterion) {
             black_box(statistic::regression::mda(&stats.close, &stats.open).collect::<Vec<_>>())
         })
     });
+    c.bench_function("blah", |b| {
+        b.iter(|| {
+            black_box(
+                statistic::distribution::rank(
+                    &stats.close,
+                    Some(statistic::distribution::RankMode::Max),
+                )
+                .collect::<Vec<_>>(),
+            )
+        })
+    });
 }
 
 criterion_group!(benches, criterion_benchmark);
diff --git a/src/correlation.rs b/src/correlation.rs
index 686961a..8e31ae6 100644
--- a/src/correlation.rs
+++ b/src/correlation.rs
@@ -8,7 +8,7 @@ use std::iter;
 use itertools::izip;
 
 use crate::smooth;
-use crate::statistic::distribution::{cov_stdev, rank};
+use crate::statistic::distribution::{cov_stdev, rank, RankMode};
 
 /// Pearson Correlation Coefficient
 ///
@@ -270,8 +270,8 @@ pub fn srcc<'a>(
             .zip(series2.windows(window))
             .map(|(x_win, y_win)| {
                 let (cov_xy, std_x, std_y) = cov_stdev(
-                    &rank(x_win).collect::<Vec<usize>>(),
-                    &rank(y_win).collect::<Vec<usize>>(),
+                    &rank(x_win, None).collect::<Vec<f64>>(),
+                    &rank(y_win, None).collect::<Vec<f64>>(),
                 );
                 cov_xy / (std_x * std_y)
             }),
@@ -335,3 +335,69 @@ pub fn krcc<'a>(
             }),
     )
 }
+
+/// Hoeffding's D
+///
+/// A test based on the population measure of deviation from independence. More
+/// resource-intensive compared to other correlation functions but may handle non-monotonic
+/// relationships better.
+///
+/// ## Usage
+///
+/// Generates a measure that ranges from -0.5 to 1, where the higher the number is, the
+/// more strongly dependent the two sequences are on each other.
+///
+/// ## Sources
+///
+/// [[1]](https://github.com/Dicklesworthstone/hoeffdings_d_explainer)
+///
+/// # Examples
+///
+/// ```
+/// use traquer::correlation;
+///
+/// correlation::hoeffd(
+///     &vec![1.0,2.0,3.0,4.0,5.0,6.0,4.0,5.0],
+///     &vec![1.0,2.0,3.0,4.0,5.0,6.0,4.0,5.0],
+///     6).collect::<Vec<f64>>();
+///
+/// ```
+
+pub fn hoeffd<'a>(
+    series1: &'a [f64],
+    series2: &'a [f64],
+    window: usize,
+) -> impl Iterator<Item = f64> + 'a {
+    iter::repeat(f64::NAN).take(window - 1).chain(
+        series1
+            .windows(window)
+            .zip(series2.windows(window))
+            .map(move |(x_win, y_win)| {
+                let rank_x = rank(x_win, Some(RankMode::Average)).collect::<Vec<f64>>();
+                let rank_y = rank(y_win, Some(RankMode::Average)).collect::<Vec<f64>>();
+                let mut q = vec![1.0; window];
+                for i in 0..window {
+                    for j in 0..window {
+                        q[i] += (rank_x[j] < rank_x[i] && rank_y[j] < rank_y[i]) as u8 as f64;
+                        q[i] +=
+                            0.25 * (rank_x[j] == rank_x[i] && rank_y[j] == rank_y[i]) as u8 as f64;
+                        q[i] += 0.5
+                            * ((rank_x[j] == rank_x[i] && rank_y[j] < rank_y[i])
+                                || (rank_x[j] < rank_x[i] && rank_y[j] == rank_y[i]))
+                                as u8 as f64;
+                    }
+                    q[i] -= 0.25; // accounts for when comparing to itself
+                }
+                let d1 = q.iter().fold(0.0, |acc, x| acc + (x - 1.0) * (x - 2.0));
+                let d2 = rank_x.iter().zip(&rank_y).fold(0.0, |acc, (x, y)| {
+                    acc + (x - 1.0) * (x - 2.0) * (y - 1.0) * (y - 2.0)
+                });
+                let d3 = izip!(q, rank_x, rank_y).fold(0.0, |acc, (q, x, y)| {
+                    acc + (x - 2.0) * (y - 2.0) * (q - 1.0)
+                });
+                30.0 * (((window - 2) * (window - 3)) as f64 * d1 + d2
+                    - 2. * (window - 2) as f64 * d3)
+                    / (window * (window - 1) * (window - 2) * (window - 3) * (window - 4)) as f64
+            }),
+    )
+}
diff --git a/src/statistic/distribution.rs b/src/statistic/distribution.rs
index 732d529..9459789 100644
--- a/src/statistic/distribution.rs
+++ b/src/statistic/distribution.rs
@@ -311,32 +311,119 @@ pub fn quantile(data: &[f64], window: usize, q: f64) -> impl Iterator<Item = f64
         }))
 }
 
-pub(crate) fn cov_stdev<'a>(x: &'a [usize], y: &'a [usize]) -> (f64, f64, f64) {
-    let x_avg = x.iter().fold(0.0, |acc, &x| acc + x as f64) / x.len() as f64;
-    let y_avg = y.iter().fold(0.0, |acc, &y| acc + y as f64) / y.len() as f64;
+pub(crate) fn cov_stdev<'a>(x: &'a [f64], y: &'a [f64]) -> (f64, f64, f64) {
+    let x_avg = x.iter().fold(0.0, |acc, &x| acc + x) / x.len() as f64;
+    let y_avg = y.iter().fold(0.0, |acc, &y| acc + y) / y.len() as f64;
     (
-        x.iter().zip(y).fold(0.0, |acc, (&xi, &yi)| {
-            acc + (xi as f64 - x_avg) * (yi as f64 - y_avg)
-        }) / (x.len() - 1) as f64,
-        (x.iter()
-            .fold(0.0, |acc, &xi| acc + (xi as f64 - x_avg).powi(2))
-            / (x.len() - 1) as f64)
-            .sqrt(),
-        (y.iter()
-            .fold(0.0, |acc, &yi| acc + (yi as f64 - y_avg).powi(2))
-            / (y.len() - 1) as f64)
-            .sqrt(),
+        x.iter()
+            .zip(y)
+            .fold(0.0, |acc, (&xi, &yi)| acc + (xi - x_avg) * (yi - y_avg))
+            / (x.len() - 1) as f64,
+        (x.iter().fold(0.0, |acc, &xi| acc + (xi - x_avg).powi(2)) / (x.len() - 1) as f64).sqrt(),
+        (y.iter().fold(0.0, |acc, &yi| acc + (yi - y_avg).powi(2)) / (y.len() - 1) as f64).sqrt(),
     )
 }
 
-pub(crate) fn rank(data: &[f64]) -> impl Iterator<Item = usize> + '_ {
+/// Rank modes types
+pub enum RankMode {
+    Average,
+    Dense,
+    Max,
+    Min,
+    Ordinal,
+}
+
+trait SortExt<T> {
+    // probably move this somewhere in future
+    fn argsort(&self) -> Vec<usize>;
+}
+
+impl<T: PartialOrd + Clone> SortExt<T> for Vec<T> {
+    fn argsort(&self) -> Vec<usize> {
+        let mut indices = (0..self.len()).collect::<Vec<_>>();
+        indices.sort_by(|&a, &b| self[a].partial_cmp(&self[b]).unwrap());
+        indices
+    }
+}
+
+/// Rank
+///
+/// Assign ranks to data, dealing with ties appropriately. Ranks begin at 1.
+///
+/// ## Sources
+///
+/// [[1]](https://en.wikipedia.org/wiki/Ranking_(statistics))
+///
+/// # Examples
+///
+/// ```
+/// use traquer::statistic::distribution;
+///
+/// distribution::rank(
+///     &[1.0,2.0,3.0,4.0,5.0],
+///     Some(distribution::RankMode::Ordinal)
+/// ).collect::<Vec<_>>();
+/// ```
+pub fn rank(data: &[f64], mode: Option<RankMode>) -> Box<dyn Iterator<Item = f64> + '_> {
     let mut values = data.to_vec();
-    values.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap());
-    let mut rank_map: HashMap<u64, usize> = HashMap::new();
 
-    for (rank, element) in values.iter().enumerate() {
-        rank_map.entry(element.to_bits()).or_insert(rank);
+    match mode {
+        Some(RankMode::Ordinal) => Box::new(
+            values
+                .argsort()
+                .argsort()
+                .iter()
+                .map(|&x| (x + 1) as f64)
+                .collect::<Vec<f64>>()
+                .into_iter(),
+        ),
+        _ => {
+            values.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap());
+            let mut rank_map: HashMap<u64, f64> = HashMap::new();
+
+            match mode {
+                Some(RankMode::Average) => {
+                    let mut start_idx = 0;
+                    for (idx, &element) in values.iter().enumerate().skip(1) {
+                        if element != values[start_idx] {
+                            rank_map.insert(
+                                values[start_idx].to_bits(),
+                                (start_idx..idx).sum::<usize>() as f64 / (idx - start_idx) as f64,
+                            );
+                            start_idx = idx;
+                        }
+                    }
+                    rank_map.insert(
+                        values[start_idx].to_bits(),
+                        (start_idx..values.len()).sum::<usize>() as f64
+                            / (values.len() - start_idx) as f64,
+                    );
+                }
+                Some(RankMode::Max) => {
+                    let size = values.len() - 1;
+                    values.iter().rev().enumerate().for_each(|(rank, element)| {
+                        rank_map
+                            .entry(element.to_bits())
+                            .or_insert((size - rank) as f64);
+                    });
+                }
+                Some(RankMode::Dense) => {
+                    let mut rank = 0.0;
+                    values.iter().for_each(|element| {
+                        if let std::collections::hash_map::Entry::Vacant(e) = rank_map.entry(element.to_bits()) {
+                            e.insert(rank);
+                            rank += 1.0;
+                        }
+                    });
+                }
+                _ => values.iter().enumerate().for_each(|(rank, element)| {
+                    rank_map.entry(element.to_bits()).or_insert(rank as f64);
+                }),
+            };
+            Box::new(
+                data.iter()
+                    .map(move |x| rank_map.get(&x.to_bits()).unwrap().to_owned() + 1.),
+            )
+        }
     }
-    data.iter()
-        .map(move |x| rank_map.get(&x.to_bits()).unwrap().to_owned())
 }
diff --git a/tests/correlation_test.rs b/tests/correlation_test.rs
index d5d83be..85a5da6 100644
--- a/tests/correlation_test.rs
+++ b/tests/correlation_test.rs
@@ -190,17 +190,57 @@ fn test_krcc() {
         result[16 - 1..]
     );
     let result = correlation::krcc(
-        &vec![7.1, 7.1, 7.2, 8.3, 9.4, 10.5, 11.4],
-        &vec![2.8, 2.9, 2.8, 2.6, 3.5, 4.6, 5.0],
+        &[7.1, 7.1, 7.2, 8.3, 9.4, 10.5, 11.4],
+        &[2.8, 2.9, 2.8, 2.6, 3.5, 4.6, 5.0],
         7,
     )
     .collect::<Vec<_>>();
     assert_eq!(0.55, result[7 - 1]);
     let result = correlation::krcc(
-        &vec![7.1, 7.1, 7.2, 8.3, 9.4, 10.5, 11.4],
-        &vec![2.8, 2.8, 2.8, 2.6, 3.5, 4.6, 5.0],
+        &[7.1, 7.1, 7.2, 8.3, 9.4, 10.5, 11.4],
+        &[2.8, 2.8, 2.8, 2.6, 3.5, 4.6, 5.0],
         7,
     )
     .collect::<Vec<_>>();
     assert_eq!(0.6324555320336759, result[7 - 1]);
 }
+
+#[test]
+fn test_hoeffd() {
+    let result = correlation::hoeffd(
+        &[55., 62., 68., 70., 72., 65., 67., 78., 78., 78.],
+        &[125., 145., 160., 156., 190., 150., 165., 250., 250., 250.],
+        10,
+    )
+    .collect::<Vec<_>>();
+    assert_eq!(0.4107142857142857, result[9]);
+
+    let stats = common::test_data();
+    let stats2 = common::test_data_path("./tests/sp500.input");
+    let result = correlation::hoeffd(&stats.close, &stats2.close, 16).collect::<Vec<_>>();
+    assert_eq!(stats.close.len(), result.len());
+    assert_eq!(
+        vec![
+            0.12248168498168498,
+            0.24336080586080586,
+            0.2976190476190476,
+            0.3857600732600733,
+            0.38553113553113555,
+            0.35393772893772896,
+            0.3518772893772894,
+            0.3257783882783883,
+            0.2564102564102564,
+            0.23992673992673993,
+            0.19482600732600733,
+            0.08333333333333333,
+            0.0011446886446886447,
+            0.02197802197802198,
+            0.03205128205128205,
+            0.14858058608058608,
+            0.25984432234432236,
+            0.3731684981684982,
+            0.4251373626373626
+        ],
+        result[16 - 1..]
+    );
+}
diff --git a/tests/stat_dist_test.rs b/tests/stat_dist_test.rs
index a85aeed..fefa1bb 100644
--- a/tests/stat_dist_test.rs
+++ b/tests/stat_dist_test.rs
@@ -359,3 +359,32 @@ fn test_median_odd() {
         result[15 - 1..]
     );
 }
+
+#[test]
+fn test_rank() {
+    let input = &[0., 2., 3., 2., 0., 2.];
+    assert_eq!(
+        rank(input, Some(RankMode::Min)).collect::<Vec<_>>(),
+        rank(input, None).collect::<Vec<_>>()
+    );
+    assert_eq!(
+        vec![1., 3., 6., 3., 1., 3.],
+        rank(input, None).collect::<Vec<_>>()
+    );
+    assert_eq!(
+        vec![2., 5., 6., 5., 2., 5.],
+        rank(input, Some(RankMode::Max)).collect::<Vec<_>>()
+    );
+    assert_eq!(
+        vec![1., 3., 6., 4., 2., 5.],
+        rank(input, Some(RankMode::Ordinal)).collect::<Vec<_>>()
+    );
+    assert_eq!(
+        vec![1.5, 4.0, 6.0, 4.0, 1.5, 4.0],
+        rank(input, Some(RankMode::Average)).collect::<Vec<_>>()
+    );
+    assert_eq!(
+        vec![1., 2., 3., 2., 1., 2.],
+        rank(input, Some(RankMode::Dense)).collect::<Vec<_>>()
+    );
+}