Skip to content

Commit

Permalink
Hoeffding's D
Browse files Browse the repository at this point in the history
  • Loading branch information
chungg committed Aug 13, 2024
1 parent 7ea0cc1 commit de24849
Show file tree
Hide file tree
Showing 5 changed files with 264 additions and 28 deletions.
14 changes: 14 additions & 0 deletions benches/traquer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -603,6 +603,9 @@ fn criterion_benchmark(c: &mut Criterion) {
c.bench_function("correlation-krcc", |b| {
b.iter(|| black_box(correlation::krcc(&stats.close, &stats.close, 16).collect::<Vec<_>>()))

Check warning on line 604 in benches/traquer.rs

View workflow job for this annotation

GitHub Actions / check-style

Diff in /home/runner/work/traquer/traquer/benches/traquer.rs
});
c.bench_function("correlation-hoeffd", |b| {
b.iter(|| black_box(correlation::hoeffd(&stats.close, &stats.close, 16).collect::<Vec<_>>()))
});

c.bench_function("stats-dist-variance", |b| {
b.iter(|| {
Expand Down Expand Up @@ -669,6 +672,17 @@ fn criterion_benchmark(c: &mut Criterion) {
black_box(statistic::regression::mda(&stats.close, &stats.open).collect::<Vec<_>>())
})
});
c.bench_function("blah", |b| {
b.iter(|| {
black_box(
statistic::distribution::rank(
&stats.close,
Some(statistic::distribution::RankMode::Max),
)
.collect::<Vec<_>>(),
)
})
});
}

criterion_group!(benches, criterion_benchmark);
Expand Down
72 changes: 69 additions & 3 deletions src/correlation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use std::iter;
use itertools::izip;

use crate::smooth;
use crate::statistic::distribution::{cov_stdev, rank};
use crate::statistic::distribution::{cov_stdev, rank, RankMode};

/// Pearson Correlation Coefficient
///
Expand Down Expand Up @@ -270,8 +270,8 @@ pub fn srcc<'a>(
.zip(series2.windows(window))
.map(|(x_win, y_win)| {
let (cov_xy, std_x, std_y) = cov_stdev(
&rank(x_win).collect::<Vec<usize>>(),
&rank(y_win).collect::<Vec<usize>>(),
&rank(x_win, None).collect::<Vec<f64>>(),
&rank(y_win, None).collect::<Vec<f64>>(),
);
cov_xy / (std_x * std_y)
}),
Expand Down Expand Up @@ -335,3 +335,69 @@ pub fn krcc<'a>(
}),
)
}

/// Hoeffding's D
///
/// A test based on the population measure of deviation from independence. More
/// resource-intensive compared to other correlation functions but may handle non-monotonic
/// relationships better.
///
/// ## Usage
///
/// Generates a measure that ranges from -0.5 to 1, where the higher the number is, the
/// more strongly dependent the two sequences are on each other.
///
/// ## Sources
///
/// [[1]](https://github.com/Dicklesworthstone/hoeffdings_d_explainer)
///
/// # Examples
///
/// ```
/// use traquer::correlation;
///
/// correlation::hoeffd(
/// &vec![1.0,2.0,3.0,4.0,5.0,6.0,4.0,5.0],
/// &vec![1.0,2.0,3.0,4.0,5.0,6.0,4.0,5.0],
/// 6).collect::<Vec<f64>>();
///
/// ```
pub fn hoeffd<'a>(
series1: &'a [f64],
series2: &'a [f64],
window: usize,
) -> impl Iterator<Item = f64> + 'a {
iter::repeat(f64::NAN).take(window - 1).chain(
series1
.windows(window)
.zip(series2.windows(window))
.map(move |(x_win, y_win)| {
let rank_x = rank(x_win, Some(RankMode::Average)).collect::<Vec<f64>>();
let rank_y = rank(y_win, Some(RankMode::Average)).collect::<Vec<f64>>();
let mut q = vec![1.0; window];
for i in 0..window {
for j in 0..window {
q[i] += (rank_x[j] < rank_x[i] && rank_y[j] < rank_y[i]) as u8 as f64;
q[i] +=
0.25 * (rank_x[j] == rank_x[i] && rank_y[j] == rank_y[i]) as u8 as f64;
q[i] += 0.5
* ((rank_x[j] == rank_x[i] && rank_y[j] < rank_y[i])
|| (rank_x[j] < rank_x[i] && rank_y[j] == rank_y[i]))
as u8 as f64;
}
q[i] -= 0.25; // accounts for when comparing to itself
}
let d1 = q.iter().fold(0.0, |acc, x| acc + (x - 1.0) * (x - 2.0));
let d2 = rank_x.iter().zip(&rank_y).fold(0.0, |acc, (x, y)| {
acc + (x - 1.0) * (x - 2.0) * (y - 1.0) * (y - 2.0)
});
let d3 = izip!(q, rank_x, rank_y).fold(0.0, |acc, (q, x, y)| {
acc + (x - 2.0) * (y - 2.0) * (q - 1.0)
});
30.0 * (((window - 2) * (window - 3)) as f64 * d1 + d2
- 2. * (window - 2) as f64 * d3)
/ (window * (window - 1) * (window - 2) * (window - 3) * (window - 4)) as f64
}),
)
}
129 changes: 108 additions & 21 deletions src/statistic/distribution.rs
Original file line number Diff line number Diff line change
Expand Up @@ -311,32 +311,119 @@ pub fn quantile(data: &[f64], window: usize, q: f64) -> impl Iterator<Item = f64
}))
}

pub(crate) fn cov_stdev<'a>(x: &'a [usize], y: &'a [usize]) -> (f64, f64, f64) {
let x_avg = x.iter().fold(0.0, |acc, &x| acc + x as f64) / x.len() as f64;
let y_avg = y.iter().fold(0.0, |acc, &y| acc + y as f64) / y.len() as f64;
pub(crate) fn cov_stdev<'a>(x: &'a [f64], y: &'a [f64]) -> (f64, f64, f64) {
let x_avg = x.iter().fold(0.0, |acc, &x| acc + x) / x.len() as f64;
let y_avg = y.iter().fold(0.0, |acc, &y| acc + y) / y.len() as f64;
(
x.iter().zip(y).fold(0.0, |acc, (&xi, &yi)| {
acc + (xi as f64 - x_avg) * (yi as f64 - y_avg)
}) / (x.len() - 1) as f64,
(x.iter()
.fold(0.0, |acc, &xi| acc + (xi as f64 - x_avg).powi(2))
/ (x.len() - 1) as f64)
.sqrt(),
(y.iter()
.fold(0.0, |acc, &yi| acc + (yi as f64 - y_avg).powi(2))
/ (y.len() - 1) as f64)
.sqrt(),
x.iter()
.zip(y)
.fold(0.0, |acc, (&xi, &yi)| acc + (xi - x_avg) * (yi - y_avg))
/ (x.len() - 1) as f64,
(x.iter().fold(0.0, |acc, &xi| acc + (xi - x_avg).powi(2)) / (x.len() - 1) as f64).sqrt(),
(y.iter().fold(0.0, |acc, &yi| acc + (yi - y_avg).powi(2)) / (y.len() - 1) as f64).sqrt(),
)
}

pub(crate) fn rank(data: &[f64]) -> impl Iterator<Item = usize> + '_ {
/// Rank modes types
pub enum RankMode {
Average,
Dense,
Max,
Min,
Ordinal,
}

trait SortExt<T> {
// probably move this somewhere in future
fn argsort(&self) -> Vec<usize>;
}

impl<T: PartialOrd + Clone> SortExt<T> for Vec<T> {
fn argsort(&self) -> Vec<usize> {
let mut indices = (0..self.len()).collect::<Vec<_>>();
indices.sort_by(|&a, &b| self[a].partial_cmp(&self[b]).unwrap());
indices
}
}

/// Rank
///
/// Assign ranks to data, dealing with ties appropriately. Ranks begin at 1.
///
/// ## Sources
///
/// [[1]](https://en.wikipedia.org/wiki/Ranking_(statistics))
///
/// # Examples
///
/// ```
/// use traquer::statistic::distribution;
///
/// distribution::rank(
/// &[1.0,2.0,3.0,4.0,5.0],
/// Some(distribution::RankMode::Ordinal)
/// ).collect::<Vec<_>>();
/// ```
pub fn rank(data: &[f64], mode: Option<RankMode>) -> Box<dyn Iterator<Item = f64> + '_> {
let mut values = data.to_vec();
values.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap());
let mut rank_map: HashMap<u64, usize> = HashMap::new();

for (rank, element) in values.iter().enumerate() {
rank_map.entry(element.to_bits()).or_insert(rank);
match mode {
Some(RankMode::Ordinal) => Box::new(
values
.argsort()
.argsort()
.iter()
.map(|&x| (x + 1) as f64)
.collect::<Vec<f64>>()
.into_iter(),
),
_ => {
values.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap());
let mut rank_map: HashMap<u64, f64> = HashMap::new();

match mode {
Some(RankMode::Average) => {
let mut start_idx = 0;
for (idx, &element) in values.iter().enumerate().skip(1) {
if element != values[start_idx] {
rank_map.insert(
values[start_idx].to_bits(),
(start_idx..idx).sum::<usize>() as f64 / (idx - start_idx) as f64,
);
start_idx = idx;
}
}
rank_map.insert(
values[start_idx].to_bits(),
(start_idx..values.len()).sum::<usize>() as f64
/ (values.len() - start_idx) as f64,
);
}
Some(RankMode::Max) => {
let size = values.len() - 1;
values.iter().rev().enumerate().for_each(|(rank, element)| {
rank_map
.entry(element.to_bits())
.or_insert((size - rank) as f64);
});
}
Some(RankMode::Dense) => {

Check warning on line 410 in src/statistic/distribution.rs

View workflow job for this annotation

GitHub Actions / check-style

Diff in /home/runner/work/traquer/traquer/src/statistic/distribution.rs
let mut rank = 0.0;
values.iter().for_each(|element| {
if let std::collections::hash_map::Entry::Vacant(e) = rank_map.entry(element.to_bits()) {
e.insert(rank);
rank += 1.0;
}
});
}
_ => values.iter().enumerate().for_each(|(rank, element)| {
rank_map.entry(element.to_bits()).or_insert(rank as f64);
}),
};
Box::new(
data.iter()
.map(move |x| rank_map.get(&x.to_bits()).unwrap().to_owned() + 1.),
)
}
}
data.iter()
.map(move |x| rank_map.get(&x.to_bits()).unwrap().to_owned())
}
48 changes: 44 additions & 4 deletions tests/correlation_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -190,17 +190,57 @@ fn test_krcc() {
result[16 - 1..]
);
let result = correlation::krcc(
&vec![7.1, 7.1, 7.2, 8.3, 9.4, 10.5, 11.4],
&vec![2.8, 2.9, 2.8, 2.6, 3.5, 4.6, 5.0],
&[7.1, 7.1, 7.2, 8.3, 9.4, 10.5, 11.4],
&[2.8, 2.9, 2.8, 2.6, 3.5, 4.6, 5.0],
7,
)
.collect::<Vec<_>>();
assert_eq!(0.55, result[7 - 1]);
let result = correlation::krcc(
&vec![7.1, 7.1, 7.2, 8.3, 9.4, 10.5, 11.4],
&vec![2.8, 2.8, 2.8, 2.6, 3.5, 4.6, 5.0],
&[7.1, 7.1, 7.2, 8.3, 9.4, 10.5, 11.4],
&[2.8, 2.8, 2.8, 2.6, 3.5, 4.6, 5.0],
7,
)
.collect::<Vec<_>>();
assert_eq!(0.6324555320336759, result[7 - 1]);
}

#[test]
fn test_hoeffd() {
let result = correlation::hoeffd(
&[55., 62., 68., 70., 72., 65., 67., 78., 78., 78.],
&[125., 145., 160., 156., 190., 150., 165., 250., 250., 250.],
10,
)
.collect::<Vec<_>>();
assert_eq!(0.4107142857142857, result[9]);

let stats = common::test_data();
let stats2 = common::test_data_path("./tests/sp500.input");
let result = correlation::hoeffd(&stats.close, &stats2.close, 16).collect::<Vec<_>>();
assert_eq!(stats.close.len(), result.len());
assert_eq!(
vec![
0.12248168498168498,
0.24336080586080586,
0.2976190476190476,
0.3857600732600733,
0.38553113553113555,
0.35393772893772896,
0.3518772893772894,
0.3257783882783883,
0.2564102564102564,
0.23992673992673993,
0.19482600732600733,
0.08333333333333333,
0.0011446886446886447,
0.02197802197802198,
0.03205128205128205,
0.14858058608058608,
0.25984432234432236,
0.3731684981684982,
0.4251373626373626
],
result[16 - 1..]
);
}
29 changes: 29 additions & 0 deletions tests/stat_dist_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -359,3 +359,32 @@ fn test_median_odd() {
result[15 - 1..]
);
}

#[test]
fn test_rank() {
let input = &[0., 2., 3., 2., 0., 2.];
assert_eq!(
rank(input, Some(RankMode::Min)).collect::<Vec<_>>(),
rank(input, None).collect::<Vec<_>>()
);
assert_eq!(
vec![1., 3., 6., 3., 1., 3.],
rank(input, None).collect::<Vec<_>>()
);
assert_eq!(
vec![2., 5., 6., 5., 2., 5.],
rank(input, Some(RankMode::Max)).collect::<Vec<_>>()
);
assert_eq!(
vec![1., 3., 6., 4., 2., 5.],
rank(input, Some(RankMode::Ordinal)).collect::<Vec<_>>()
);
assert_eq!(
vec![1.5, 4.0, 6.0, 4.0, 1.5, 4.0],
rank(input, Some(RankMode::Average)).collect::<Vec<_>>()
);
assert_eq!(
vec![1., 2., 3., 2., 1., 2.],
rank(input, Some(RankMode::Dense)).collect::<Vec<_>>()
);
}

0 comments on commit de24849

Please sign in to comment.