Skip to content

Commit

Permalink
Improved performance for missing words; consistent handling of wildca…
Browse files Browse the repository at this point in the history
…rd matching
  • Loading branch information
zgornel committed Nov 6, 2018
1 parent b508bf1 commit ddfcd18
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 46 deletions.
49 changes: 25 additions & 24 deletions src/document_embeddings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,15 @@ end
"""
Retrieves the embedding matrix for a given `document`.
"""
function embed_document(conceptnet::ConceptNet,
function embed_document(conceptnet::ConceptNet{L,K,V},
document::AbstractString;
language=Languages.English(),
keep_size::Bool=true,
compound_word_separator::String="_",
max_compound_word_length::Int=1,
wildcard_matching::Bool=false,
print_matched_words::Bool=false)
print_matched_words::Bool=false
) where {L<:Language, K, V}
# Split document into tokens and embed
return embed_document(conceptnet,
tokenize_for_conceptnet(document),
Expand All @@ -36,15 +37,15 @@ function embed_document(conceptnet::ConceptNet,
print_matched_words=print_matched_words)
end

function embed_document(conceptnet::ConceptNet,
function embed_document(conceptnet::ConceptNet{L,K,V},
document_tokens::Vector{S};
language=Languages.English(),
keep_size::Bool=true,
compound_word_separator::String="_",
max_compound_word_length::Int=1,
wildcard_matching::Bool=false,
print_matched_words::Bool=false
) where S<:AbstractString
) where {L<:Language, K, V, S<:AbstractString}
# Initializations
embeddings = conceptnet.embeddings[language]
# Get positions of words that can be used for indexing (found)
Expand All @@ -67,18 +68,26 @@ function embed_document(conceptnet::ConceptNet,
# Get best matches for not found words
not_found_positions = setdiff(1:length(document_tokens),
collect.(found_positions)...)
words_not_found = document_tokens[not_found_positions]
if keep_size
for word in words_not_found
push!(found_words, word) # the zero-vectors will be the
end # last columns of the document matrix
end
# Return
if print_matched_words
words_not_found = document_tokens[not_found_positions]
println("Embedded words: $found_words")
println("Mismatched words: $words_not_found")
end
return conceptnet[language, found_words], not_found_positions
default = zeros(eltype(V), conceptnet.width)
_embdoc = get(conceptnet.embeddings[language],
found_words,
default,
conceptnet.fuzzy_words[language],
n=conceptnet.width,
wildcard_matching=wildcard_matching)
if keep_size
embedded_document = hcat(_embdoc, zeros(eltype(V), conceptnet.width,
length(not_found_positions)))
else
embedded_document = _embdoc
end
return embedded_document, not_found_positions
end


Expand All @@ -96,6 +105,8 @@ function make_word_from_tokens(tokens::Vector{S},
end
end



# Function that searches for subphrases (contiguous token combinations)
# from a document among the embedded words and returns the positions of matched
# subphrases/words
Expand Down Expand Up @@ -124,26 +135,16 @@ function token_search(conceptnet::ConceptNet{L,K,V},
wildcard_matching::Bool=false) where
{L<:Language, K, V, S<:AbstractString}
# Initializations
if wildcard_matching
# Build function that checks whether a token is found in conceptnet
# using/or not wildcard matching
check_function = (conceptnet, language, token, default)->
!isempty(get(conceptnet[language], # get from interface.jl
token,
default,
conceptnet.fuzzy_words[language]))
else
check_function = (conceptnet, language, token, default)->
haskey(conceptnet[language], token)
end
found = Vector{UnitRange{Int}}()
n = length(tokens)
i = 1
j = n
while i <= n
if j-i+1 <= max_length
token = join(tokens[i:j], separator, separator)
is_match = check_function(conceptnet, language, token, V())
is_match = !isempty(get(conceptnet[language], token, V(),
conceptnet.fuzzy_words[language],
wildcard_matching=wildcard_matching))
if is_match
push!(found, i:j)
i = j + 1
Expand Down
63 changes: 41 additions & 22 deletions src/interface.jl
Original file line number Diff line number Diff line change
Expand Up @@ -37,43 +37,60 @@ show(io::IO, conceptnet::ConceptNetEnglish) =
# Example: the embedding corresponding to "###_something" is returned for any search query
# of two words where the first word is made out of 3 letters followed by
# the word 'something'
# Look up the embedding of a single `keyword`.
# Returns `embeddings[keyword]` when the key exists. Otherwise, candidates are the
# entries of `fuzzy_words` that have the same length as `keyword` and whose text,
# with every '#' turned into the regex '.', occurs in `keyword`; the embedding of
# the best-scoring candidate is returned, or `default` when there is none.
# NOTE(review): this span is a rendered diff without +/- markers, so two
# signatures and two copies of the fuzzy-matching body appear below. The
# one-line signature has no `wildcard_matching` keyword, so it is presumably the
# pre-commit version - confirm against the repository.
function get(embeddings::Dict{K,V}, keyword::K, default::V, fuzzy_words::Vector{K}) where
function get(embeddings::Dict{K,V},
keyword::K,
default::V,
fuzzy_words::Vector{K};
wildcard_matching::Bool=true) where
{K<:AbstractString, V<:AbstractVector}
if haskey(embeddings, keyword)
# The keyword exists in the dictionary
return embeddings[keyword]
else
# The keyword is not found; try fuzzy matching
ω = 0.4 # weight assigned to matching a #, 1-ω weight assigned to a matching letter
L = length(keyword)
matches = (word for word in fuzzy_words
if length(word) == L &&
occursin(Regex(replace(word,"#"=>".")), keyword))
if isempty(matches)
return default
else
best_match = ""
max_score = 0
for match in matches
l = length(replace(match,"#"=>"")) # number of letters matched
score = ω*(L-l)/L + (1-ω)*l/L
if score > max_score
best_match = match
max_score = score
# This copy of the body is guarded by the `wildcard_matching` keyword: fuzzy
# matching is attempted only when it is true, otherwise `default` is returned.
if wildcard_matching
# The keyword is not found; try fuzzy matching
ω = 0.4 # weight assigned to matching a #, 1-ω weight assigned to a matching letter
L = length(keyword)
# Lazy generator over the candidate patterns (same length, '#' acts as any character)
matches = (word for word in fuzzy_words
if length(word) == L &&
occursin(Regex(replace(word,"#"=>".")), keyword))
if isempty(matches)
return default
else
best_match = ""
max_score = 0
for match in matches
l = length(replace(match,"#"=>"")) # number of letters matched
# L-l is the number of '#' characters in the candidate, l the number of other characters
score = ω*(L-l)/L + (1-ω)*l/L
if score > max_score
best_match = match
max_score = score
end
end
return embeddings[best_match]
end
return embeddings[best_match]
else
# The keyword is not found; no fuzzy matching
return default
end
end
end

# Look up the embeddings of several `keywords` at once.
# Allocates an `n` x `length(keywords)` matrix and writes into column `i` the result
# of the single-keyword `get` for `keywords[i]`; the filled matrix is returned.
# NOTE(review): this span is a rendered diff without +/- markers, so two
# signatures and two versions of the inner `get` call appear below. The one-line
# forms lack `wildcard_matching`, so they are presumably the pre-commit lines -
# confirm against the repository.
# NOTE(review): `n` defaults to 0, which allocates a matrix with zero rows; the
# callers visible in this diff pass `n=conceptnet.width`. Presumably `n` has to
# equal the embedding length for the column assignment to succeed - confirm.
function get(embeddings::Dict{K,V}, keywords::AbstractVector{K}, default::V, fuzzy_words::Vector{K};
function get(embeddings::Dict{K,V},
keywords::AbstractVector{K},
default::V,
fuzzy_words::Vector{K};
wildcard_matching::Bool=true,
n::Int=0) where
{K<:AbstractString, V<:AbstractVector}
# One column per keyword
p = length(keywords)
keywords_embedded = Matrix{eltype(V)}(undef, n, p)
for i in 1:p
keywords_embedded[:,i] = get(embeddings, keywords[i], default, fuzzy_words)
# This form of the call forwards `wildcard_matching` to the single-keyword lookup
keywords_embedded[:,i] = get(embeddings,
keywords[i],
default,
fuzzy_words,
wildcard_matching=wildcard_matching)
end
return keywords_embedded
end
Expand All @@ -89,6 +106,7 @@ getindex(conceptnet::ConceptNet{L,K,V}, language::L, words::S) where
words,
zeros(eltype(V), conceptnet.width),
conceptnet.fuzzy_words[language],
wildcard_matching=true,
n=conceptnet.width)

# Generic indexing, multiple words
Expand All @@ -104,7 +122,8 @@ getindex(conceptnet::ConceptNet{L,K,V}, language::L, word::S) where
get(conceptnet.embeddings[language],
word,
zeros(eltype(V), conceptnet.width),
conceptnet.fuzzy_words[language])
conceptnet.fuzzy_words[language],
wildcard_matching=true)

# Generic indexing, single word
# Example: julia> conceptnet[:en, "word"]
Expand Down

0 comments on commit ddfcd18

Please sign in to comment.