Skip to content

Commit

Permalink
hoeffding's d
Browse files Browse the repository at this point in the history
also make rank function public
  • Loading branch information
chungg committed Aug 19, 2024
1 parent 16b1c88 commit 243bb1d
Show file tree
Hide file tree
Showing 5 changed files with 282 additions and 24 deletions.
16 changes: 16 additions & 0 deletions benches/traquer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -606,6 +606,11 @@ fn criterion_benchmark(c: &mut Criterion) {
c.bench_function("correlation-krcc", |b| {
b.iter(|| black_box(correlation::krcc(&stats.close, &stats.close, 16).collect::<Vec<_>>()))
});
c.bench_function("correlation-hoeffd", |b| {
b.iter(|| {
black_box(correlation::hoeffd(&stats.close, &stats.close, 16).collect::<Vec<_>>())
})
});

c.bench_function("stats-dist-variance", |b| {
b.iter(|| {
Expand Down Expand Up @@ -641,6 +646,17 @@ fn criterion_benchmark(c: &mut Criterion) {
black_box(statistic::distribution::quantile(&stats.close, 16, 90.0).collect::<Vec<_>>())
})
});
c.bench_function("stats-dist-rank", |b| {
b.iter(|| {
black_box(
statistic::distribution::rank(
&stats.close,
Some(statistic::distribution::RankMode::Min),
)
.collect::<Vec<_>>(),
)
})
});

c.bench_function("stats-regress-mse", |b| {
b.iter(|| {
Expand Down
71 changes: 68 additions & 3 deletions src/correlation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use std::iter;
use itertools::izip;

use crate::smooth;
use crate::statistic::distribution::{cov_stdev, rank};
use crate::statistic::distribution::{cov_stdev, rank, RankMode};

/// Pearson Correlation Coefficient
///
Expand Down Expand Up @@ -270,8 +270,8 @@ pub fn srcc<'a>(
.zip(series2.windows(window))
.map(|(x_win, y_win)| {
let (cov_xy, std_x, std_y) = cov_stdev(
&rank(x_win).collect::<Vec<usize>>(),
&rank(y_win).collect::<Vec<usize>>(),
&rank(x_win, None).collect::<Vec<f64>>(),
&rank(y_win, None).collect::<Vec<f64>>(),
);
cov_xy / (std_x * std_y)
}),
Expand Down Expand Up @@ -335,3 +335,68 @@ pub fn krcc<'a>(
}),
)
}

/// Hoeffding's D
///
/// A test based on the population measure of deviation from independence. More
/// resource-intensive compared to other correlation functions but may handle non-monotonic
/// relationships better.
///
/// ## Usage
///
/// Generates a measure that ranges from -0.5 to 1, where the higher the number is, the
/// more strongly dependent the two sequences are on each other.
///
/// ## Sources
///
/// [[1]](https://github.com/Dicklesworthstone/hoeffdings_d_explainer)
///
/// # Examples
///
/// ```
/// use traquer::correlation;
///
/// correlation::hoeffd(
/// &vec![1.0,2.0,3.0,4.0,5.0,6.0,4.0,5.0],
/// &vec![1.0,2.0,3.0,4.0,5.0,6.0,4.0,5.0],
/// 6).collect::<Vec<f64>>();
///
/// ```
pub fn hoeffd<'a>(
series1: &'a [f64],
series2: &'a [f64],
window: usize,
) -> impl Iterator<Item = f64> + 'a {
iter::repeat(f64::NAN).take(window - 1).chain(
series1
.windows(window)
.zip(series2.windows(window))
.map(move |(x_win, y_win)| {
let rank_x = rank(x_win, Some(RankMode::Average)).collect::<Vec<f64>>();
let rank_y = rank(y_win, Some(RankMode::Average)).collect::<Vec<f64>>();
let mut q = vec![1.0; window];
for i in 0..window {
for j in 0..window {
q[i] += (rank_x[j] < rank_x[i] && rank_y[j] < rank_y[i]) as u8 as f64;
q[i] +=
0.25 * (rank_x[j] == rank_x[i] && rank_y[j] == rank_y[i]) as u8 as f64;
q[i] += 0.5
* ((rank_x[j] == rank_x[i] && rank_y[j] < rank_y[i])
|| (rank_x[j] < rank_x[i] && rank_y[j] == rank_y[i]))
as u8 as f64;
}
q[i] -= 0.25; // accounts for when comparing to itself
}
let d1 = q.iter().fold(0.0, |acc, x| acc + (x - 1.0) * (x - 2.0));
let d2 = rank_x.iter().zip(&rank_y).fold(0.0, |acc, (x, y)| {
acc + (x - 1.0) * (x - 2.0) * (y - 1.0) * (y - 2.0)
});
let d3 = izip!(q, rank_x, rank_y).fold(0.0, |acc, (q, x, y)| {
acc + (x - 2.0) * (y - 2.0) * (q - 1.0)
});
30.0 * (((window - 2) * (window - 3)) as f64 * d1 + d2
- 2. * (window - 2) as f64 * d3)
/ (window * (window - 1) * (window - 2) * (window - 3) * (window - 4)) as f64
}),
)
}
150 changes: 129 additions & 21 deletions src/statistic/distribution.rs
Original file line number Diff line number Diff line change
Expand Up @@ -311,32 +311,140 @@ pub fn quantile(data: &[f64], window: usize, q: f64) -> impl Iterator<Item = f64
}))
}

pub(crate) fn cov_stdev<'a>(x: &'a [usize], y: &'a [usize]) -> (f64, f64, f64) {
let x_avg = x.iter().fold(0.0, |acc, &x| acc + x as f64) / x.len() as f64;
let y_avg = y.iter().fold(0.0, |acc, &y| acc + y as f64) / y.len() as f64;
pub(crate) fn cov_stdev<'a>(x: &'a [f64], y: &'a [f64]) -> (f64, f64, f64) {
let x_avg = x.iter().fold(0.0, |acc, &x| acc + x) / x.len() as f64;
let y_avg = y.iter().fold(0.0, |acc, &y| acc + y) / y.len() as f64;
(
x.iter().zip(y).fold(0.0, |acc, (&xi, &yi)| {
acc + (xi as f64 - x_avg) * (yi as f64 - y_avg)
}) / (x.len() - 1) as f64,
(x.iter()
.fold(0.0, |acc, &xi| acc + (xi as f64 - x_avg).powi(2))
/ (x.len() - 1) as f64)
.sqrt(),
(y.iter()
.fold(0.0, |acc, &yi| acc + (yi as f64 - y_avg).powi(2))
/ (y.len() - 1) as f64)
.sqrt(),
x.iter()
.zip(y)
.fold(0.0, |acc, (&xi, &yi)| acc + (xi - x_avg) * (yi - y_avg))
/ (x.len() - 1) as f64,
(x.iter().fold(0.0, |acc, &xi| acc + (xi - x_avg).powi(2)) / (x.len() - 1) as f64).sqrt(),
(y.iter().fold(0.0, |acc, &yi| acc + (yi - y_avg).powi(2)) / (y.len() - 1) as f64).sqrt(),
)
}

pub(crate) fn rank(data: &[f64]) -> impl Iterator<Item = usize> + '_ {
/// Rank modes types
pub enum RankMode {
Average,
Dense,
Max,
Min,
Ordinal,
}

// trait SortExt<T> {
// // probably move this somewhere in future
// fn argsort(&self) -> Vec<usize>;
// }
//
// impl<T: PartialOrd + Clone> SortExt<T> for Vec<T> {
// fn argsort(&self) -> Vec<usize> {
// let mut indices = (0..self.len()).collect::<Vec<_>>();
// indices.sort_by(|&a, &b| self[a].partial_cmp(&self[b]).unwrap());
// indices
// }
// }

/// Rank
///
/// Assign ranks to data, dealing with ties appropriately. Ranks begin at 1.
///
/// Different ranking options are available:
///
/// - `Average`: The average of the ranks that would have been assigned to all the tied values is
/// assigned to each value.
/// - `Min`: The minimum of the ranks that would have been assigned to all the tied values is
/// assigned to each value. (This is also referred to as “competition” ranking.)
/// - `Max`: The maximum of the ranks that would have been assigned to all the tied values is
/// assigned to each value.
/// - `Dense`: Like `Min`, but the rank of the next highest element is assigned the rank
/// immediately after those assigned to the tied elements.
/// - `Ordinal`: All values are given a distinct rank, corresponding to the order that the values
/// occur in input.
///
/// ## Sources
///
/// [[1]](https://en.wikipedia.org/wiki/Ranking_(statistics))
///
/// # Examples
///
/// ```
/// use traquer::statistic::distribution;
///
/// distribution::rank(
/// &[1.0,2.0,3.0,4.0,5.0],
/// Some(distribution::RankMode::Ordinal)
/// ).collect::<Vec<_>>();
/// ```
pub fn rank(data: &[f64], mode: Option<RankMode>) -> Box<dyn Iterator<Item = f64> + '_> {
let mut values = data.to_vec();
values.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap());
let mut rank_map: HashMap<u64, usize> = HashMap::new();

for (rank, element) in values.iter().enumerate() {
rank_map.entry(element.to_bits()).or_insert(rank);
match mode {
Some(RankMode::Ordinal) => {
// argsort().argsort().iter().map(|&x| (x + 1) as f64) saves memory but is slower
values.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap());
let mut rank_map: HashMap<u64, (f64, f64)> = HashMap::new();
values.iter().enumerate().for_each(|(rank, element)| {
rank_map
.entry(element.to_bits())
.or_insert((rank as f64, 0.));
});
Box::new(data.iter().map(move |x| {
let entry = rank_map.get(&x.to_bits()).unwrap().to_owned();
rank_map.insert(x.to_bits(), (entry.0, entry.1 + 1.));
entry.0 + entry.1 + 1.
}))
}
_ => {
values.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap());
let mut rank_map: HashMap<u64, f64> = HashMap::new();

match mode {
Some(RankMode::Average) => {
let mut start_idx = 0;
for (idx, &element) in values.iter().enumerate().skip(1) {
if element != values[start_idx] {
rank_map.insert(
values[start_idx].to_bits(),
(start_idx..idx).sum::<usize>() as f64 / (idx - start_idx) as f64,
);
start_idx = idx;
}
}
rank_map.insert(
values[start_idx].to_bits(),
(start_idx..values.len()).sum::<usize>() as f64
/ (values.len() - start_idx) as f64,
);
}
Some(RankMode::Max) => {
let size = values.len() - 1;
values.iter().rev().enumerate().for_each(|(rank, element)| {
rank_map
.entry(element.to_bits())
.or_insert((size - rank) as f64);
});
}
Some(RankMode::Dense) => {
let mut rank = 0.0;
values.iter().for_each(|element| {
if let std::collections::hash_map::Entry::Vacant(e) =
rank_map.entry(element.to_bits())
{
e.insert(rank);
rank += 1.0;
}
});
}
_ => values.iter().enumerate().for_each(|(rank, element)| {
rank_map.entry(element.to_bits()).or_insert(rank as f64);
}),
};
Box::new(
data.iter()
.map(move |x| rank_map.get(&x.to_bits()).unwrap().to_owned() + 1.),
)
}
}
data.iter()
.map(move |x| rank_map.get(&x.to_bits()).unwrap().to_owned())
}
40 changes: 40 additions & 0 deletions tests/correlation_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -204,3 +204,43 @@ fn test_krcc() {
.collect::<Vec<_>>();
assert_eq!(0.6324555320336759, result[7 - 1]);
}

#[test]
fn test_hoeffd() {
let result = correlation::hoeffd(
&[55., 62., 68., 70., 72., 65., 67., 78., 78., 78.],
&[125., 145., 160., 156., 190., 150., 165., 250., 250., 250.],
10,
)
.collect::<Vec<_>>();
assert_eq!(0.4107142857142857, result[9]);

let stats = common::test_data();
let stats2 = common::test_data_path("./tests/sp500.input");
let result = correlation::hoeffd(&stats.close, &stats2.close, 16).collect::<Vec<_>>();
assert_eq!(stats.close.len(), result.len());
assert_eq!(
vec![
0.12248168498168498,
0.24336080586080586,
0.2976190476190476,
0.3857600732600733,
0.38553113553113555,
0.35393772893772896,
0.3518772893772894,
0.3257783882783883,
0.2564102564102564,
0.23992673992673993,
0.19482600732600733,
0.08333333333333333,
0.0011446886446886447,
0.02197802197802198,
0.03205128205128205,
0.14858058608058608,
0.25984432234432236,
0.3731684981684982,
0.4251373626373626
],
result[16 - 1..]
);
}
29 changes: 29 additions & 0 deletions tests/stat_dist_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -359,3 +359,32 @@ fn test_median_odd() {
result[15 - 1..]
);
}

#[test]
fn test_rank() {
let input = &[0., 2., 3., 2., 0., 2.];
assert_eq!(
rank(input, Some(RankMode::Min)).collect::<Vec<_>>(),
rank(input, None).collect::<Vec<_>>()
);
assert_eq!(
vec![1., 3., 6., 3., 1., 3.],
rank(input, None).collect::<Vec<_>>()
);
assert_eq!(
vec![2., 5., 6., 5., 2., 5.],
rank(input, Some(RankMode::Max)).collect::<Vec<_>>()
);
assert_eq!(
vec![1., 3., 6., 4., 2., 5.],
rank(input, Some(RankMode::Ordinal)).collect::<Vec<_>>()
);
assert_eq!(
vec![1.5, 4.0, 6.0, 4.0, 1.5, 4.0],
rank(input, Some(RankMode::Average)).collect::<Vec<_>>()
);
assert_eq!(
vec![1., 2., 3., 2., 1., 2.],
rank(input, Some(RankMode::Dense)).collect::<Vec<_>>()
);
}

0 comments on commit 243bb1d

Please sign in to comment.