Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add efficient SparseVector method for some metrics #235

Merged
merged 12 commits into from
Dec 2, 2021
3 changes: 2 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
name = "Distances"
uuid = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
version = "0.10.6"
version = "0.10.7"

[deps]
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsAPI = "82ae8749-77ed-4fe6-ae5f-f523153014b0"

Expand Down
1 change: 1 addition & 0 deletions src/Distances.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ module Distances

using LinearAlgebra
using Statistics
using SparseArrays: SparseVectorUnion, nonzeroinds, nonzeros, nnz
import StatsAPI: pairwise, pairwise!

export
Expand Down
28 changes: 28 additions & 0 deletions src/bhattacharyya.jl
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,34 @@ end
return sqab, asum, bsum
end

@inline function _bhattacharyya_coeff(a::SparseVectorUnion{<:Number}, b::SparseVectorUnion{<:Number})
anzind = nonzeroinds(a)
bnzind = nonzeroinds(b)
anzval = nonzeros(a)
bnzval = nonzeros(b)
ma = nnz(a)
mb = nnz(b)

ia = 1; ib = 1
eval_op = (a, b) -> sqrt(a * b)
s = zero(eval_op(zero(eltype(a)), zero(eltype(b))))
@inbounds while ia <= ma && ib <= mb
ja = anzind[ia]
jb = bnzind[ib]
if ja == jb
v = eval_op(anzval[ia], bnzval[ib])
s = v + s
ia += 1; ib += 1
elseif ja < jb
ia += 1
else
ib += 1
end
end
# efficient method for sum for SparseVectorView is missing
return s, sum(anzval), sum(bnzval)
end

# Faster pair- and column-wise versions TBD...


Expand Down
47 changes: 47 additions & 0 deletions src/metrics.jl
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,53 @@ Base.@propagate_inbounds function _evaluate(d::UnionMetrics, a::AbstractArray, b
end
end

eval_op_a(d, ai, b) = eval_op(d, ai, zero(eltype(b)))
eval_op_b(d, bi, a) = eval_op(d, zero(eltype(a)), bi)

Base.@propagate_inbounds function _evaluate(d::UnionMetrics, a::SparseVectorUnion, b::SparseVectorUnion, ::Nothing)
@boundscheck if length(a) != length(b)
throw(DimensionMismatch("first array has length $(length(a)) which does not match the length of the second, $(length(b))."))
end
if length(a) == 0
return zero(result_type(d, a, b))
end
anzind = nonzeroinds(a)
bnzind = nonzeroinds(b)
anzval = nonzeros(a)
bnzval = nonzeros(b)
ma = nnz(a)
mb = nnz(b)
ia = 1; ib = 1
s = eval_start(d, a, b)
@inbounds while ia <= ma && ib <= mb
ja = anzind[ia]
jb = bnzind[ib]
if ja == jb
v = eval_op(d, anzval[ia], bnzval[ib])
ia += 1; ib += 1
elseif ja < jb
v = eval_op_a(d, anzval[ia], b)
ia += 1
else
v = eval_op_b(d, bnzval[ib], a)
ib += 1
end
s = eval_reduce(d, s, v)
end
@inbounds while ia <= ma
v = eval_op_a(d, anzval[ia], b)
s = eval_reduce(d, s, v)
ia += 1
end
@inbounds while ib <= mb
v = eval_op_b(d, bnzval[ib], a)
s = eval_reduce(d, s, v)
ib += 1
end
return eval_end(d, s)
end


_evaluate(dist::UnionMetrics, a::Number, b::Number, ::Nothing) = eval_end(dist, eval_op(dist, a, b))
function _evaluate(dist::UnionMetrics, a::Number, b::Number, p)
length(p) != 1 && throw(DimensionMismatch("inputs are scalars but parameters have length $(length(p))."))
Expand Down
87 changes: 51 additions & 36 deletions test/test_dists.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Unit tests for Distances

using SparseArrays: sparsevec, sprand

struct FooDist <: PreMetric end # Julia 1.0 Compat: struct definition must be put in global scope

@testset "result_type" begin
Expand Down Expand Up @@ -211,7 +213,7 @@ end
for (_x, _y) in (([4.0, 5.0, 6.0, 7.0], [3.0, 9.0, 8.0, 1.0]),
([4.0, 5.0, 6.0, 7.0], [3. 8.; 9. 1.0]))
x, y = T.(_x), T.(_y)
for (x, y) in ((x, y),
for (x, y) in ((x, y), (sparsevec(x), sparsevec(y)),
(convert(Array{Union{Missing, T}}, x), convert(Array{Union{Missing, T}}, y)),
((Iterators.take(x, 4), Iterators.take(y, 4))), # iterator
(((x[i] for i in 1:length(x)), (y[i] for i in 1:length(y)))), # generator
Expand Down Expand Up @@ -498,41 +500,43 @@ end

@testset "bhattacharyya / hellinger" begin
for T in (Int, Float64, F64)
x, y = T.([4, 5, 6, 7]), T.([3, 9, 8, 1])
a = T.([1, 2, 1, 3, 2, 1])
b = T.([1, 3, 0, 2, 2, 0])
p = T == Int ? rand(0:10, 12) : rand(T, 12)
p[p .< median(p)] .= 0
q = T == Int ? rand(0:10, 12) : rand(T, 12)

# Bhattacharyya and Hellinger distances are defined for discrete
# probability distributions so to calculate the expected values
# we need to normalize vectors.
px = x ./ sum(x)
py = y ./ sum(y)
expected_bc_x_y = sum(sqrt.(px .* py))
for (x, y) in ((x, y), (Iterators.take(x, 12), Iterators.take(y, 12)))
@test Distances.bhattacharyya_coeff(x, y) ≈ expected_bc_x_y
@test bhattacharyya(x, y) ≈ (-log(expected_bc_x_y))
@test hellinger(x, y) ≈ sqrt(1 - expected_bc_x_y)
end
_x, _y = T.([4, 5, 6, 7]), T.([3, 9, 8, 1])
_a = T.([1, 2, 1, 3, 2, 1])
_b = T.([1, 3, 0, 2, 2, 0])
_p = T == Int ? rand(0:10, 12) : rand(T, 12)
_p[_p .< median(_p)] .= 0
_q = T == Int ? rand(0:10, 12) : rand(T, 12)

for (x, y, a, b, p, q) in ((_x, _y, _a, _b, _p, _q), sparsevec.((_x, _y, _a, _b, _p, _q)))
# Bhattacharyya and Hellinger distances are defined for discrete
# probability distributions so to calculate the expected values
# we need to normalize vectors.
px = x ./ sum(x)
py = y ./ sum(y)
expected_bc_x_y = sum(sqrt.(px .* py))
for (x, y) in ((x, y), (Iterators.take(x, 12), Iterators.take(y, 12)))
@test Distances.bhattacharyya_coeff(x, y) ≈ expected_bc_x_y
@test bhattacharyya(x, y) ≈ (-log(expected_bc_x_y))
@test hellinger(x, y) ≈ sqrt(1 - expected_bc_x_y)
end

pa = a ./ sum(a)
pb = b ./ sum(b)
expected_bc_a_b = sum(sqrt.(pa .* pb))
@test Distances.bhattacharyya_coeff(a, b) ≈ expected_bc_a_b
@test bhattacharyya(a, b) ≈ (-log(expected_bc_a_b))
@test hellinger(a, b) ≈ sqrt(1 - expected_bc_a_b)

pp = p ./ sum(p)
pq = q ./ sum(q)
expected_bc_p_q = sum(sqrt.(pp .* pq))
@test Distances.bhattacharyya_coeff(p, q) ≈ expected_bc_p_q
@test bhattacharyya(p, q) ≈ (-log(expected_bc_p_q))
@test hellinger(p, q) ≈ sqrt(1 - expected_bc_p_q)

# Ensure it is semimetric
@test bhattacharyya(x, y) ≈ bhattacharyya(y, x)
pa = a ./ sum(a)
pb = b ./ sum(b)
expected_bc_a_b = sum(sqrt.(pa .* pb))
@test Distances.bhattacharyya_coeff(a, b) ≈ expected_bc_a_b
@test bhattacharyya(a, b) ≈ (-log(expected_bc_a_b))
@test hellinger(a, b) ≈ sqrt(1 - expected_bc_a_b)

pp = p ./ sum(p)
pq = q ./ sum(q)
expected_bc_p_q = sum(sqrt.(pp .* pq))
@test Distances.bhattacharyya_coeff(p, q) ≈ expected_bc_p_q
@test bhattacharyya(p, q) ≈ (-log(expected_bc_p_q))
@test hellinger(p, q) ≈ sqrt(1 - expected_bc_p_q)

# Ensure it is semimetric
@test bhattacharyya(x, y) ≈ bhattacharyya(y, x)
end
end
end #testset

Expand Down Expand Up @@ -763,7 +767,7 @@ end

X = rand(ComplexF64, m, nx)
Y = rand(ComplexF64, m, ny)

test_pairwise(SqEuclidean(), X, Y, Float64)
test_pairwise(Euclidean(), X, Y, Float64)

Expand Down Expand Up @@ -940,6 +944,17 @@ end
@test pairwise(PeriodicEuclidean(p), X, Y, dims=2)[1,2] == 0m
end

@testset "SparseVector, nnz(a) != nnz(b)" begin
for (n, densa, densb) in ((100, .1, .8), (200, .8, .1))
a = sprand(n, densa)
b = sprand(n, densb)
for d in (bhattacharyya, euclidean, sqeuclidean, jaccard, cityblock, totalvariation,
chebyshev, braycurtis, hamming)
@test d(a, b) ≈ d(Vector(a), Vector(b))
end
end
end

#=
@testset "zero allocation colwise!" begin
d = Euclidean()
Expand Down