From 9fb099e2aacbc6bf03d6ffd771da1fe7d293c767 Mon Sep 17 00:00:00 2001 From: matthieugomez Date: Sun, 26 Sep 2021 10:47:30 -0400 Subject: [PATCH] tag new version --- Project.toml | 2 +- src/StringDistances.jl | 8 +++++--- src/fuzzywuzzy.jl | 21 ++++++++------------- src/utils.jl | 5 ++--- 4 files changed, 16 insertions(+), 20 deletions(-) diff --git a/Project.toml b/Project.toml index 7a359c5..ef525b7 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "StringDistances" uuid = "88034a9c-02f8-509d-84a9-84ec65e18404" -version = "0.11.0" +version = "0.11.1" [deps] Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" diff --git a/src/StringDistances.jl b/src/StringDistances.jl index 603adef..d4ac067 100755 --- a/src/StringDistances.jl +++ b/src/StringDistances.jl @@ -2,10 +2,10 @@ module StringDistances using Distances import StatsAPI: pairwise, pairwise! +# Distances API abstract type StringSemiMetric <: SemiMetric end abstract type StringMetric <: Metric end -(dist::Union{StringSemiMetric, StringMetric})(s1, s2; max_dist = nothing) = dist(s1, s2) - +const StringDistance = Union{StringSemiMetric, StringMetric} function Distances.result_type(dist::Union{StringSemiMetric, StringMetric}, s1::Type, s2::Type) T = typeof(dist("", "")) if (Missing <: s1) | (Missing <: s2) @@ -15,6 +15,9 @@ function Distances.result_type(dist::Union{StringSemiMetric, StringMetric}, s1:: end Distances.result_type(dist::Union{StringSemiMetric, StringMetric}, s1, s2) = result_type(dist, typeof(s1), typeof(s2)) + + +(dist::Union{StringSemiMetric, StringMetric})(s1, s2; max_dist = nothing) = dist(s1, s2) include("utils.jl") include("distances/edit.jl") include("distances/qgram.jl") @@ -24,7 +27,6 @@ include("find.jl") include("fuzzywuzzy.jl") -const StringDistance = Union{StringSemiMetric, StringMetric} ############################################################################## ## ## Export diff --git a/src/fuzzywuzzy.jl b/src/fuzzywuzzy.jl index 1e3e2f8..cdeee6c 100755 --- a/src/fuzzywuzzy.jl +++ b/src/fuzzywuzzy.jl @@ -34,15 +34,15 @@ function (dist::Partial)(s1, s2; max_dist = nothing) return out end +# specialized (faster) version for RatcliffObershelp function (dist::Partial{<: Union{RatcliffObershelp, Normalized{RatcliffObershelp}}})(s1, s2; max_dist = nothing) (s1 === missing) | (s2 === missing) && return missing s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) len1 == len2 && return dist.dist(s1, s2) out = 1.0 - for r in matching_blocks(s1, s2, 1, 1, len1, len2) + for s2_start in matching_blocks(s1, s2, 1, 1, len1, len2) # Make sure the substring of s2 has length len1 - s2_start = r[2] - r[1] + 1 if s2_start < 1 s2_start = 1 elseif s2_start + len1 - 1 > len2 @@ -56,20 +56,16 @@ function (dist::Partial{<: Union{RatcliffObershelp, Normalized{RatcliffObershelp end function matching_blocks(s1, s2, start1::Integer, start2::Integer, end1::Integer, end2::Integer) - x = Set{Tuple{Int, Int, Int}}() + x = Set{Int}() p = zeros(Int, max(end1 - start1, end2 - start2) + 1) matching_blocks!(x, p, s1, s2, start1, start2, end1, end2) end -function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, p::Vector{Int}, s1, s2, start1::Integer, start2::Integer, end1::Integer, end2::Integer) +function matching_blocks!(x::Set{Int}, p::Vector{Int}, s1, s2, start1::Integer, start2::Integer, end1::Integer, end2::Integer) j1, j2, len = longest_common_pattern!(p, s1, s2, start1, start2, end1, end2) - # exit if there is no common substring len == 0 && return x - # add the info of the common to the existing set - push!(x, (j1, j2, len)) - # add the longest common substring that happens before + push!(x, j2 - j1 + 1) matching_blocks!(x, p, s1, s2, start1, start2, j1 - 1, j2 - 1) - # add the longest common substring that happens after matching_blocks!(x, p, s1, s2, j1 + len, j2 + len, end1, end2) return x end @@ -137,10 +133,9 @@ function (dist::TokenSet)(s1::Union{AbstractString, Missing}, s2::Union{Abstract s1 = join(v1, " ") s2 = join(v2, " ") isempty(s0) && return dist.dist(s1, s2; max_dist = max_dist) - out_01 = dist.dist(s0, s1; max_dist = max_dist) - out_02 = dist.dist(s0, s2; max_dist = max_dist) - out_12 = dist.dist(s1, s2; max_dist = max_dist) - min(out_01, out_02, out_12) + min(dist.dist(s0, s1; max_dist = max_dist), + dist.dist(s0, s2; max_dist = max_dist), + dist.dist(s1, s2; max_dist = max_dist)) end Normalized(dist::TokenSet) = Normalized{typeof(TokenSet(Normalized(dist.dist)))}(TokenSet(Normalized(dist.dist))) diff --git a/src/utils.jl b/src/utils.jl index e8cf27b..d359ab8 100755 --- a/src/utils.jl +++ b/src/utils.jl @@ -26,12 +26,11 @@ string_with_length(s::AbstractString) = StringWithLength(s, length(s)) # Not really needed but avoid multi-encapsulation string_with_length(s::StringWithLength) = s Base.length(s::StringWithLength) = s.l -Base.iterate(s::StringWithLength, i::Integer = firstindex(s.s)) = iterate(s.s, i) +Base.iterate(s::StringWithLength) = iterate(s.s) +Base.iterate(s::StringWithLength, i::Integer) = iterate(s.s, i) Base.nextind(s::StringWithLength, i::Int, n::Int = 1) = nextind(s.s, i, n) Base.ncodeunits(s::StringWithLength) = ncodeunits(s.s) Base.isvalid(s::StringWithLength, i::Int) = isvalid(s.s, i) - - function reorder(s1::AbstractString, s2::AbstractString) s1 = string_with_length(s1) s2 = string_with_length(s2)