From 8774aac1a2e2f31dbcd5af44909bff9cace5ea99 Mon Sep 17 00:00:00 2001 From: Corneliu Cofaru Date: Fri, 19 Oct 2018 14:36:33 +0200 Subject: [PATCH 1/8] Dependency updates --- Manifest.toml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 9ef3c4b..fc77a98 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -9,9 +9,9 @@ version = "0.8.10" [[BinaryProvider]] deps = ["Libdl", "Pkg", "SHA", "Test"] -git-tree-sha1 = "48c147e63431adbcee69bc40b04c3f0fec0a4982" +git-tree-sha1 = "9930c1a6cd49d9fcd7218df6be417e6ae4f1468a" uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" -version = "0.5.0" +version = "0.5.2" [[Blosc]] deps = ["BinaryProvider", "CMakeWrapper", "Compat", "Libdl"] @@ -27,9 +27,9 @@ version = "1.0.0" [[CMake]] deps = ["BinDeps", "Libdl", "Pkg", "Test"] -git-tree-sha1 = "4f0b34e12d4d2c6a367d62c73c961ad468d62b7b" +git-tree-sha1 = "74853a75c26a4a73ac391ee26ee29ebeb5583d9f" uuid = "631607c0-34d2-5d66-819e-eb0f9aa2061a" -version = "1.0.2" +version = "1.1.0" [[CMakeWrapper]] deps = ["BinDeps", "CMake", "Libdl", "Parameters", "Pkg", "Test"] @@ -45,9 +45,9 @@ version = "0.5.0" [[Compat]] deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] -git-tree-sha1 = "ff2595695fc4f14427358ce2593f867085c45dcb" +git-tree-sha1 = "2d9e14d19bad3f9ad5cc5e4cffabc3cfa59de825" uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "1.2.0" +version = "1.3.0" [[Dates]] deps = ["Printf"] @@ -91,9 +91,9 @@ uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" [[IterTools]] deps = ["Pkg", "SparseArrays", "Test"] -git-tree-sha1 = "ed0787e62dc46b8d8c7c3db54391d71e0da5fefd" +git-tree-sha1 = "79246285c43602384e6f1943b3554042a3712056" uuid = "c8e1da08-722c-5040-9ed9-7db0dc04731e" -version = "1.0.0" +version = "1.1.1" [[JSON]] deps = ["Dates", "Distributed", "Mmap", "Pkg", "Sockets", "Test", "Unicode"] From bc0250685744e533c3328ff16520a030971b6ff2 Mon Sep 17 00:00:00 2001 From: Corneliu Cofaru Date: Fri, 19 Oct 2018 14:51:07 +0200 Subject: [PATCH 2/8] Added constants to defaults.jl --- src/ConceptnetNumberbatch.jl | 53 +----------------------------------- src/defaults.jl | 51 ++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 52 deletions(-) create mode 100644 src/defaults.jl diff --git a/src/ConceptnetNumberbatch.jl b/src/ConceptnetNumberbatch.jl index 15566de..85e1f4d 100644 --- a/src/ConceptnetNumberbatch.jl +++ b/src/ConceptnetNumberbatch.jl @@ -29,58 +29,6 @@ using NearestNeighbors import Base: get, getindex, size, length, show, keys, values, in -# Links pointing to the latest ConceptNetNumberbatch version (v"17.06") -const CONCEPTNET_MULTI_LINK = "https://conceptnet.s3.amazonaws.com/downloads/2017/numberbatch/numberbatch-17.06.txt.gz" -const CONCEPTNET_EN_LINK = "https://conceptnet.s3.amazonaws.com/downloads/2017/numberbatch/numberbatch-en-17.06.txt.gz" -const CONCEPTNET_HDF5_LINK = "https://conceptnet.s3.amazonaws.com/precomputed-data/2016/numberbatch/17.06/mini.h5" - -# Accepted languages (map from conceptnet to Languages.Language) -const LANGUAGES = Dict(:en=>Languages.English(), - :fr=>Languages.French(), - :de=>Languages.German(), - :it=>Languages.Italian(), - :fi=>Languages.Finnish(), - :nl=>Languages.Dutch(), - :af=>Languages.Dutch(), - :pt=>Languages.Portuguese(), - :es=>Languages.Spanish(), - :ru=>Languages.Russian(), 
-                       :sh=>Languages.Serbian(),  # and Languages.Croatian()
-                       :sv=>Languages.Swedish(),
-                       :cs=>Languages.Czech(),
-                       :pl=>Languages.Polish(),
-                       :bg=>Languages.Bulgarian(),
-                       :eo=>Languages.Esperanto(),
-                       :hu=>Languages.Hungarian(),
-                       :el=>Languages.Greek(),
-                       :no=>Languages.Nynorsk(),
-                       :sl=>Languages.Slovene(),
-                       :ro=>Languages.Romanian(),
-                       :vi=>Languages.Vietnamese(),
-                       :lv=>Languages.Latvian(),
-                       :tr=>Languages.Turkish(),
-                       :da=>Languages.Danish(),
-                       :ar=>Languages.Arabic(),
-                       :fa=>Languages.Persian(),
-                       :ko=>Languages.Korean(),
-                       :th=>Languages.Thai(),
-                       :ka=>Languages.Georgian(),
-                       :he=>Languages.Hebrew(),
-                       :te=>Languages.Telugu(),
-                       :et=>Languages.Estonian(),
-                       :hi=>Languages.Hindi(),
-                       :lt=>Languages.Lithuanian(),
-                       :uk=>Languages.Ukrainian(),
-                       :be=>Languages.Belarusian(),
-                       :sw=>Languages.Swahili(),
-                       :ur=>Languages.Urdu(),
-                       :ku=>Languages.Kurdish(),
-                       :az=>Languages.Azerbaijani(),
-                       :ta=>Languages.Tamil()
-                       # add more mappings here if needed
-                       # AND supported by Languages.jl
-                       )
-
 export CONCEPTNET_MULTI_LINK,
        CONCEPTNET_EN_LINK,
        CONCEPTNET_HDF5_LINK,
@@ -89,6 +37,7 @@ export CONCEPTNET_MULTI_LINK,
        load_embeddings,
        phrase_embeddings
 
+include("defaults.jl")
 include("interface.jl")
 include("files.jl")
 include("search.jl")
diff --git a/src/defaults.jl b/src/defaults.jl
new file mode 100644
index 0000000..0d3c14d
--- /dev/null
+++ b/src/defaults.jl
@@ -0,0 +1,51 @@
+# Links pointing to the latest ConceptNetNumberbatch version (v"17.06")
+const CONCEPTNET_MULTI_LINK = "https://conceptnet.s3.amazonaws.com/downloads/2017/numberbatch/numberbatch-17.06.txt.gz"
+const CONCEPTNET_EN_LINK = "https://conceptnet.s3.amazonaws.com/downloads/2017/numberbatch/numberbatch-en-17.06.txt.gz"
+const CONCEPTNET_HDF5_LINK = "https://conceptnet.s3.amazonaws.com/precomputed-data/2016/numberbatch/17.06/mini.h5"
+
+# Accepted languages (map from conceptnet to Languages.Language)
+const LANGUAGES = Dict(:en=>Languages.English(),
+                       :fr=>Languages.French(),
+                       :de=>Languages.German(),
+                       :it=>Languages.Italian(),
+                       :fi=>Languages.Finnish(),
+                       :nl=>Languages.Dutch(),
+                       :af=>Languages.Dutch(),
+                       :pt=>Languages.Portuguese(),
+                       :es=>Languages.Spanish(),
+                       :ru=>Languages.Russian(),
+                       :sh=>Languages.Serbian(),  # and Languages.Croatian()
+                       :sv=>Languages.Swedish(),  # Swedish is :sv in ConceptNet; :sw is Swahili
+                       :cs=>Languages.Czech(),
+                       :pl=>Languages.Polish(),
+                       :bg=>Languages.Bulgarian(),
+                       :eo=>Languages.Esperanto(),
+                       :hu=>Languages.Hungarian(),
+                       :el=>Languages.Greek(),
+                       :no=>Languages.Nynorsk(),
+                       :sl=>Languages.Slovene(),
+                       :ro=>Languages.Romanian(),
+                       :vi=>Languages.Vietnamese(),
+                       :lv=>Languages.Latvian(),
+                       :tr=>Languages.Turkish(),
+                       :da=>Languages.Danish(),
+                       :ar=>Languages.Arabic(),
+                       :fa=>Languages.Persian(),
+                       :ko=>Languages.Korean(),
+                       :th=>Languages.Thai(),
+                       :ka=>Languages.Georgian(),
+                       :he=>Languages.Hebrew(),
+                       :te=>Languages.Telugu(),
+                       :et=>Languages.Estonian(),
+                       :hi=>Languages.Hindi(),
+                       :lt=>Languages.Lithuanian(),
+                       :uk=>Languages.Ukrainian(),
+                       :be=>Languages.Belarusian(),
+                       :sw=>Languages.Swahili(),
+                       :ur=>Languages.Urdu(),
+                       :ku=>Languages.Kurdish(),
+                       :az=>Languages.Azerbaijani(),
+                       :ta=>Languages.Tamil()
+                       # add more mappings here if needed
+                       # AND supported by Languages.jl
+                       )
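Patch 2 only moves constants into `defaults.jl`, but for orientation, here is a hedged sketch of how they are consumed downstream. The local file path is illustrative and not part of the patch; `LANGUAGES` is not exported, so it is accessed through the module:

```julia
# Illustrative only: map a ConceptNet language code to a Languages.jl language
# and fetch the English embeddings using the link constants from defaults.jl.
using ConceptnetNumberbatch, Languages

lang = ConceptnetNumberbatch.LANGUAGES[:fr]   # Languages.French()
download_embeddings(url=CONCEPTNET_EN_LINK,
                    localfile="./numberbatch-en-17.06.txt.gz")  # hypothetical path
```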
From 6c24b4fd7c0fd97d55e21f68f820651f5fd3845c Mon Sep 17 00:00:00 2001
From: Corneliu Cofaru
Date: Fri, 19 Oct 2018 19:09:44 +0200
Subject: [PATCH 3/8] Improved speed of phrase embeddings

---
 src/interface.jl | 26 +++++++++++++---------
 src/search.jl    | 58 +++++++++++++++++++++++++++---------------------
 2 files changed, 49 insertions(+), 35 deletions(-)

diff --git a/src/interface.jl b/src/interface.jl
index 79aa5d8..fda2637 100644
--- a/src/interface.jl
+++ b/src/interface.jl
@@ -38,9 +38,9 @@ show(io::IO, conceptnet::ConceptNetEnglish) =
 # Example: the embedding corresponding to "###_something" is returned for any search query
 # of two words where the first word is made of 3 letters followed by
 # the word 'something'
-function get(embeddings::Dict{K,V}, keyword, default::V, fuzzy_words::Vector{K}) where {K<:AbstractString, V<:AbstractVector}
-    words = keys(embeddings)
-    if keyword in words
+function get(embeddings::Dict{K,V}, keyword, default::V, fuzzy_words::Vector{K}) where
+    {K<:AbstractString, V<:AbstractVector}
+    if haskey(embeddings, keyword)
         # The keyword exists in the dictionary
         return embeddings[keyword]
     else
@@ -73,14 +73,20 @@ end
 # Indexing
 # Generic indexing, multiple words
 # Example: julia> conceptnet[Languages.English(), ["another", "word"]]
+# TODO: make type stable; add a new get method for keyword vectors
 getindex(conceptnet::ConceptNet{L,K,V}, language::L, words::S) where
-    {L<:Language, K, V, S<:AbstractVector{<:AbstractString}} =
-    hcat((get(conceptnet.embeddings[language],
-              word,
-              zeros(eltype(V), conceptnet.width),
-              conceptnet.fuzzy_words[language])
-          for word in words)...
-        )::Matrix{eltype(V)}
+    {L<:Language, K, V, S<:AbstractVector{<:AbstractString}} = begin
+    if !isempty(words)
+        hcat((get(conceptnet.embeddings[language],
+                  word,
+                  zeros(eltype(V), conceptnet.width),
+                  conceptnet.fuzzy_words[language])
+              for word in words)...
+            )::Matrix{eltype(V)}
+    else
+        Vector{eltype(V)}()
+    end
+end
 
 # Generic indexing, multiple words
 # Example: julia> conceptnet[:en, ["another", "word"]]
diff --git a/src/search.jl b/src/search.jl
index c769a6b..4b5ccff 100644
--- a/src/search.jl
+++ b/src/search.jl
@@ -11,38 +11,46 @@ function phrase_embeddings(conceptnet::ConceptNet,
                            distance=Levenshtein())
     # Initializations
     sep = "_"
-    tokens = split(phrase)
-    dictionary = collect(keys(conceptnet.embeddings[language]))
+    phrase_tokens = strip.(split(phrase))
+    embeddings = conceptnet.embeddings[language]
     # Generate positions of words that can be used for indexing (found)
     # and that can be searched (not_found)
-    found = token_search(tokens,
-                         dictionary,
+    found = token_search(phrase_tokens,
+                         embeddings,
                          sep=sep,
                          max_length=max_compound_word_length)
-    not_found = setdiff(1:length(tokens), found...)
     # Get found words
     words = Vector{String}()
     for pos in found
-        word = make_word_from_tokens(tokens, pos, sep, sep)
+        word = make_word_from_tokens(phrase_tokens, pos, sep, sep)
         push!(words, word)
     end
     # Get best matches for not found words
-    for pos in not_found
-        word = make_word_from_tokens(tokens, pos, sep, sep)
-        if search_mismatches == :no
-            # Insert not found words if exact matches are to be
-            # returned only if a matrix of width equal to the
-            # number of terms is to be returned
-            keep_size && push!(words, word)
-        elseif search_mismatches == :brute_force
-            matcher = dict_word->evaluate(distance, word, dict_word)
-            _, match_pos = findmin(map(matcher, dictionary))
-            push!(words, dictionary[match_pos])
-        else
-            @warn "The only supported approximate string matching" *
-                  " method is :brute_force. Use :no for skipping the" *
-                  " search; will not search."
-            push!(words, word)
+    words_not_found = setdiff(phrase_tokens, words)
+    if keep_size && !isempty(words_not_found)  # keep_size has precedence
+        for word in words_not_found
+            if search_mismatches == :no
+                # Insert not found words if exact matches are to be
+                # returned only if a matrix of width equal to the
+                # number of terms is to be returned
+                push!(words, word)
+            elseif search_mismatches == :brute_force
+                match_word = ""
+                distmin = Inf
+                for dict_word in keys(embeddings)
+                    dist = evaluate(distance, word, dict_word)
+                    if dist < distmin
+                        distmin = dist
+                        match_word = dict_word
+                    end
+                end
+                push!(words, match_word)
+            else
+                @warn "The only supported approximate string matching" *
+                      " method is :brute_force. Use :no to skip the" *
+                      " search; no search will be performed."
+                push!(words, word)
+            end
         end
     end
     # Return
@@ -62,7 +70,7 @@ function make_word_from_tokens(tokens, pos, sep, sep_end)
 end
 
 # Function that searches subphrases (continuous token combinations)
-# from a phrase in a dictionary and returns the positions of matched
+# from a phrase in the embedded words and returns the positions of matched
 # subphrases/words
 # Example:
 # - for a vector: String[a, simpler, world, would, be, more, complicated],
 # max_length=7 and sep='_', it would generate:
 # String[a_simpler_world_..._complicated,
 #        a_simpler_world_..._more,
 #        ...
 #        more_complicated,
 #        complicated]
-function token_search(tokens, dictionary; sep::String="_", max_length::Int=3)
+function token_search(tokens, embeddings; sep::String="_", max_length::Int=3)
     found = Vector{UnitRange{Int}}()
     n = length(tokens)
     i = 1
     j = n
     while i <= n
         token = join(tokens[i:j], sep, sep)
-        if token in dictionary && j-i+1 <= max_length
+        if haskey(embeddings, token) && j-i+1 <= max_length
             push!(found, i:j)
             i = j + 1
             j = n
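The `:brute_force` branch introduced in patch 3 is a linear nearest-neighbour scan over the whole vocabulary. A minimal standalone sketch of that idea (a hypothetical helper, not part of the patch; it uses the same StringDistances `evaluate` API the patch relies on):

```julia
using StringDistances

# Return the vocabulary word closest to `word` under `distance`,
# scanning every entry (O(n) in vocabulary size, hence "brute force").
function closest_word(word::AbstractString, vocabulary, distance=Levenshtein())
    match_word, distmin = "", Inf
    for dict_word in vocabulary
        dist = evaluate(distance, word, dict_word)
        if dist < distmin
            distmin, match_word = dist, dict_word
        end
    end
    return match_word
end

closest_word("wrld", ["word", "world", "sword"])  # returns "world" (distance 1)
```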
From 32620824535c65c83f3d746431bf2d93cdb1845c Mon Sep 17 00:00:00 2001
From: Corneliu Cofaru
Date: Fri, 19 Oct 2018 20:43:22 +0200
Subject: [PATCH 4/8] Improved type stability

---
 src/interface.jl | 41 +++++++++++++++++++++++------------------
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/src/interface.jl b/src/interface.jl
index fda2637..c89cf0f 100644
--- a/src/interface.jl
+++ b/src/interface.jl
@@ -38,7 +38,7 @@ show(io::IO, conceptnet::ConceptNetEnglish) =
 # Example: the embedding corresponding to "###_something" is returned for any search query
 # of two words where the first word is made of 3 letters followed by
 # the word 'something'
-function get(embeddings::Dict{K,V}, keyword, default::V, fuzzy_words::Vector{K}) where
+function get(embeddings::Dict{K,V}, keyword::K, default::V, fuzzy_words::Vector{K}) where
     {K<:AbstractString, V<:AbstractVector}
     if haskey(embeddings, keyword)
         # The keyword exists in the dictionary
@@ -68,25 +68,27 @@ function get(embeddings::Dict{K,V}, keyword, default::V, fuzzy_words::Vector{K})
     end
 end
 
-
+function get(embeddings::Dict{K,V}, keywords::AbstractVector{K}, default::V, fuzzy_words::Vector{K};
+             n::Int=0) where
+    {K<:AbstractString, V<:AbstractVector}
+    p = length(keywords)
+    keywords_embedded = Matrix{eltype(V)}(undef, n, p)
+    for i in 1:p
+        keywords_embedded[:,i] = get(embeddings, keywords[i], default, fuzzy_words)
+    end
+    return keywords_embedded
+end
 # Indexing
 # Generic indexing, multiple words
 # Example: julia> conceptnet[Languages.English(), ["another", "word"]]
-# TODO: make type stable; add a new get method for keyword vectors
 getindex(conceptnet::ConceptNet{L,K,V}, language::L, words::S) where
-    {L<:Language, K, V, S<:AbstractVector{<:AbstractString}} = begin
-    if !isempty(words)
-        hcat((get(conceptnet.embeddings[language],
-                  word,
-                  zeros(eltype(V), conceptnet.width),
-                  conceptnet.fuzzy_words[language])
-              for word in words)...
-            )::Matrix{eltype(V)}
-    else
-        Vector{eltype(V)}()
-    end
-end
+    {L<:Language, K, V, S<:AbstractVector{<:AbstractString}} =
+    get(conceptnet.embeddings[language],
+        words,
+        zeros(eltype(V), conceptnet.width),
+        conceptnet.fuzzy_words[language],
+        n=conceptnet.width)
 
 # Generic indexing, multiple words
 # Example: julia> conceptnet[:en, ["another", "word"]]
@@ -98,13 +100,16 @@ getindex(conceptnet::ConceptNet{L,K,V}, language::Symbol, words::S) where
 # Generic indexing, single word
 # Example: julia> conceptnet[Languages.English(), "word"]
 getindex(conceptnet::ConceptNet{L,K,V}, language::L, word::S) where
     {L<:Language, K, V, S<:AbstractString} =
-    conceptnet[language, [word]]
+    get(conceptnet.embeddings[language],
+        word,
+        zeros(eltype(V), conceptnet.width),
+        conceptnet.fuzzy_words[language])
 
 # Generic indexing, single word
 # Example: julia> conceptnet[:en, "word"]
 getindex(conceptnet::ConceptNet{L,K,V}, language::Symbol, word::S) where
     {L<:Language, K, V, S<:AbstractString} =
-    conceptnet[LANGUAGES[language], [word]]
+    conceptnet[LANGUAGES[language], word]
 
 # Single-language indexing: conceptnet[["another", "word"]], if language==Languages.English()
 getindex(conceptnet::ConceptNet{L,K,V}, words::S) where
@@ -114,7 +119,7 @@ getindex(conceptnet::ConceptNet{L,K,V}, words::S) where
 # Single-language indexing: conceptnet["word"], if language==Languages.English()
 getindex(conceptnet::ConceptNet{L,K,V}, word::S) where
     {L<:Languages.Language, K, V, S<:AbstractString} =
-    conceptnet[L(), [word]]
+    conceptnet[L(), word]
 
 # Index by language (returns a Dict{word=>embedding})
 getindex(conceptnet::ConceptNet, language::L) where {L<:Languages.Language} =
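The new vector `get` method in patch 4 is what makes the indexing type stable: instead of splatting a generator into `hcat` (whose inferred return type is abstract, and which degenerates for empty input), it preallocates a `Matrix{eltype(V)}` of known shape. A minimal illustration of the difference, with a toy dictionary standing in for the embeddings (hypothetical code, not part of the patch):

```julia
embeddings = Dict("a" => [1.0, 2.0], "b" => [3.0, 4.0])

# hcat over a splatted generator: abstract inferred return type, and for an
# empty `words` the call becomes hcat(), which yields a 0-element Vector{Any}
# rather than a Matrix
unstable(words) = hcat((get(embeddings, w, zeros(2)) for w in words)...)

# Preallocation: concrete Matrix{Float64} return type, empty input included
function stable(words, n::Int=2)
    out = Matrix{Float64}(undef, n, length(words))
    for (i, w) in enumerate(words)
        out[:, i] = get(embeddings, w, zeros(n))
    end
    return out
end

stable(["a", "b", "missing"])  # 2×3 Matrix{Float64}, zeros for the missing word
stable(String[])               # 2×0 Matrix{Float64}, no special-casing needed
```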
From 080cf657c0f88282165bdfb52ea34707e96f9830 Mon Sep 17 00:00:00 2001
From: Corneliu Cofaru
Date: Mon, 22 Oct 2018 11:24:19 +0200
Subject: [PATCH 5/8] Many changes: refactored document embedding and added
 tests, improved tokenization, deactivated word models

---
 Manifest.toml                             | 12 -------
 Project.toml                              |  1 -
 REQUIRE                                   |  1 -
 src/ConceptnetNumberbatch.jl              |  5 ++-
 src/defaults.jl                           |  3 ++
 src/{search.jl => document_embeddings.jl} | 42 ++++++++++++++--------
 src/files.jl                              | 26 ++++++++------
 src/interface.jl                          |  3 +-
 src/word_model.jl                         |  7 ++++
 test/runtests.jl                          | 44 +++++++++++++++++++++++
 10 files changed, 101 insertions(+), 43 deletions(-)
 rename src/{search.jl => document_embeddings.jl} (73%)

diff --git a/Manifest.toml b/Manifest.toml
index fc77a98..57b8b67 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -145,12 +145,6 @@ uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
 [[Mmap]]
 uuid = "a63ad114-7e13-5084-954f-fe012c677804"
 
-[[NearestNeighbors]]
-deps = ["Distances", "LinearAlgebra", "Mmap", "Pkg", "StaticArrays", "Test"]
-git-tree-sha1 = "aab46b96ae5c2a9c08146188016d6312276094e5"
-uuid = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
-version = "0.4.2"
-
 [[OrderedCollections]]
 deps = ["Pkg", "Random", "Serialization", "Test"]
 git-tree-sha1 = "85619a3f3e17bb4761fe1b1fd47f0e979f964d5b"
@@ -196,12 +190,6 @@ uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
 deps = ["LinearAlgebra", "Random"]
 uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 
-[[StaticArrays]]
-deps = ["InteractiveUtils", "LinearAlgebra", "Random", "Statistics", "Test"]
-git-tree-sha1 = "d432c79bef174a830304f8601427a4357dfdbfb7"
-uuid = "90137ffa-7385-5640-81b9-e52037218182"
-version = "0.8.3"
-
 [[Statistics]]
 deps = ["LinearAlgebra", "SparseArrays"]
 uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
diff --git a/Project.toml b/Project.toml
index 8b632f1..88040fe 100644
--- 
a/Project.toml +++ b/Project.toml @@ -7,7 +7,6 @@ version = "0.1.0" CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43" -NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce" StringDistances = "88034a9c-02f8-509d-84a9-84ec65e18404" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" diff --git a/REQUIRE b/REQUIRE index a3ddeca..4b31580 100644 --- a/REQUIRE +++ b/REQUIRE @@ -4,4 +4,3 @@ CodecZlib HDF5 Languages StringDistances -NearestNeighbors diff --git a/src/ConceptnetNumberbatch.jl b/src/ConceptnetNumberbatch.jl index 85e1f4d..9bb2ee4 100644 --- a/src/ConceptnetNumberbatch.jl +++ b/src/ConceptnetNumberbatch.jl @@ -25,7 +25,6 @@ using CodecZlib using HDF5 using Languages using StringDistances -using NearestNeighbors import Base: get, getindex, size, length, show, keys, values, in @@ -35,11 +34,11 @@ export CONCEPTNET_MULTI_LINK, ConceptNet, download_embeddings, load_embeddings, - phrase_embeddings + embed_document include("defaults.jl") include("interface.jl") include("files.jl") -include("search.jl") +include("document_embeddings.jl") end # module diff --git a/src/defaults.jl b/src/defaults.jl index 0d3c14d..5be2f29 100644 --- a/src/defaults.jl +++ b/src/defaults.jl @@ -49,3 +49,6 @@ const LANGUAGES = Dict(:en=>Languages.English(), # add more mappings here if needed # AND supported by Languages.jl ) + +# Regular expression on which to split text into tokens +const DEFAULT_SPLITTER = r"(,|:|\\|\/|;|\.|\[|\]|\{|\}|\"|\"|\s+)" diff --git a/src/search.jl b/src/document_embeddings.jl similarity index 73% rename from src/search.jl rename to src/document_embeddings.jl index 4b5ccff..6c811dd 100644 --- a/src/search.jl +++ b/src/document_embeddings.jl @@ -1,32 +1,45 @@ """ -Retrieves the embedding matrix for a given `phrase`. +Fast tokenization function. """ -function phrase_embeddings(conceptnet::ConceptNet, - phrase::S where S<:AbstractString; - language=Languages.English(), - keep_size::Bool=true, - max_compound_word_length::Int=1, - search_mismatches::Symbol=:no, - show_words::Bool=true, - distance=Levenshtein()) +function custom_tokenize(doc::AbstractString, splitter::Regex=DEFAULT_SPLITTER) + # First, split + tokens = strip.(split(doc, splitter)) + # Filter out empty strings + filter!(!isempty, tokens) +end + + + +""" +Retrieves the embedding matrix for a given `document`. 
+""" +function embed_document(conceptnet::ConceptNet, + document::S where S<:AbstractString; + language=Languages.English(), + keep_size::Bool=true, + max_compound_word_length::Int=1, + search_mismatches::Symbol=:no, + show_words::Bool=true, + distance=Levenshtein()) # Initializations sep = "_" - phrase_tokens = strip.(split(phrase)) embeddings = conceptnet.embeddings[language] + # Split into tokens + document_tokens = custom_tokenize(document) # Generate positions of words that can be used for indexing (found) # and that can be searched (not_found) - found = token_search(phrase_tokens, + found = token_search(document_tokens, embeddings, sep=sep, max_length=max_compound_word_length) # Get found words words = Vector{String}() for pos in found - word = make_word_from_tokens(phrase_tokens, pos, sep, sep) + word = make_word_from_tokens(document_tokens, pos, sep, sep) push!(words, word) end # Get best matches for not found words - words_not_found = setdiff(phrase_tokens, words) + words_not_found = setdiff(document_tokens, words) if keep_size && !isempty(words_not_found) # keep_size has precendence for word in words_not_found if search_mismatches == :no @@ -70,7 +83,7 @@ function make_word_from_tokens(tokens, pos, sep, sep_end) end # Function that searches subphrases (continuous token combinations) -# from a phrase in a the embedded words and returns the positions of matched +# from a document in a the embedded words and returns the positions of matched # subphrases/words # Example: # - for a vector: String[a, simpler, world, would, be, more, complicated], @@ -89,6 +102,7 @@ end # ... # more_complicated, # complicated] +# TODO(Corneliu): Implement wildcard matching as well function token_search(tokens, embeddings; sep::String="_", max_length::Int=3) found = Vector{UnitRange{Int}}() n = length(tokens) diff --git a/src/files.jl b/src/files.jl index f1bf513..cf7024a 100644 --- a/src/files.jl +++ b/src/files.jl @@ -1,6 +1,6 @@ """ -Downloads embeddings given a `url` and saves them to a file -pointed to by `localfile`. +Download ConceptNetNumberbatch embeddings given a `url` and saves them +to a file pointed to by `localfile`. """ function download_embeddings(;url=CONCEPTNET_EN_LINK, localfile=abspath("./_conceptnet_/" * @@ -20,7 +20,7 @@ end """ -Function that loads the embeddings given a valid ConceptNetNumberbatch `filepath`, +Load the embeddings given a valid ConceptNetNumberbatch `filepath`, lading at most `max_vocab_size` embeddings if no specific `keep_words` are specified, filtering on `languages`. """ @@ -63,8 +63,9 @@ function load_embeddings(filepath::AbstractString; end - -# Loads the ConceptNetNumberbatch from a .gz or uncompressed file +""" +Load the ConceptNetNumberbatch embeddings from a .gz or uncompressed file. +""" function _load_gz_embeddings(filepath::S1, decompressor::TranscodingStreams.Codec, max_vocab_size::Union{Nothing,Int}, @@ -118,8 +119,9 @@ function _load_gz_embeddings(filepath::S1, end - -# Loads the ConceptNetNumberbatch from a HDF5 file +""" +Load the ConceptNetNumberbatch embeddings from a HDF5 file. +""" function _load_hdf5_embeddings(filepath::S1, max_vocab_size::Union{Nothing,Int}, keep_words::Vector{S2}; @@ -206,8 +208,9 @@ function process_language_argument(languages::Vector{L}, end - -# Function that calculates how many embeddings to retreive +""" +Calculate how many embeddings to retreive. 
+""" function _get_vocab_size(real_vocab_size, max_vocab_size=nothing, keep_words=String[]) @@ -231,8 +234,9 @@ function _get_vocab_size(real_vocab_size, end - -# Parse a line +""" +Parse a line of text from a ConceptNetNumberbatch delimited file. +""" function _parseline(buf; word_only=false) bufvec = split(buf, " ") word = string(popfirst!(bufvec)) diff --git a/src/interface.jl b/src/interface.jl index c89cf0f..ae6bddd 100644 --- a/src/interface.jl +++ b/src/interface.jl @@ -8,7 +8,6 @@ ConceptNet(embeddings::Dict{K,V}, width::Int) where {K<:AbstractString, V<:AbstractVector} = ConceptNet{Languages.English(), K, V}(embeddings, width, Dict(Languages.English()=>K[])) - # Aliases const ConceptNetMulti{L} = ConceptNet{L, String, Vector{Float64}} const ConceptNetMultiCompressed{L} = ConceptNet{L, String, Vector{Int8}} @@ -79,6 +78,8 @@ function get(embeddings::Dict{K,V}, keywords::AbstractVector{K}, default::V, fuz return keywords_embedded end + + # Indexing # Generic indexing, multiple words # Example: julia> conceptnet[Languages.English(), ["another", "word"]) diff --git a/src/word_model.jl b/src/word_model.jl index d08e8f1..b74faff 100644 --- a/src/word_model.jl +++ b/src/word_model.jl @@ -1,3 +1,8 @@ +# EXPERIMENTAL, not to be used! +# To use, uncomment the code below and add NearestNeighbors. + +#= + # ngram NN-model for faster approximate string matching # So far it is not really workable, results are bad # An example can be found in "./scripts/test_word_model.jl" @@ -70,3 +75,5 @@ function get_ngrams(words::Vector{S}, n::Int=2) where S<:AbstractString end return unique(ngrams) end + +=# diff --git a/test/runtests.jl b/test/runtests.jl index 7e6f267..d9cf63a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -109,6 +109,50 @@ end end end +@testset "Document Embedding" begin + filepath = joinpath(string(@__DIR__), "data", "_test_file_en.txt.gz") + conceptnet = load_embeddings(filepath, languages=[Languages.English()]) + # Document with no matchable words + doc = "a aaaaa b" + embedded_doc = embed_document(conceptnet, doc, keep_size=false, + max_compound_word_length=1, + search_mismatches=:no, + show_words=false) + @test embedded_doc isa Matrix{Float64} + @test isempty(embedded_doc) + embedded_doc = embed_document(conceptnet, doc, keep_size=true, + max_compound_word_length=1, + search_mismatches=:no, + show_words=false) + @test embedded_doc isa Matrix{Float64} + @test size(embedded_doc, 2) == length( + ConceptnetNumberbatch.custom_tokenize(doc)) + # Document with all words matchable + doc_2 = "Five words: huge adapter, xxxyyyzish, 2342 metres ." 
+    embedded_doc_2 = embed_document(conceptnet, doc_2, keep_size=false,
+                                    max_compound_word_length=2,
+                                    search_mismatches=:no,
+                                    show_words=false)
+    @test embedded_doc_2 isa Matrix{Float64}
+    @test isempty(embedded_doc_2)  # no exact matches (wildcard matching not supported yet)
+    embedded_doc_2 = embed_document(conceptnet, doc_2, keep_size=true,
+                                    max_compound_word_length=2,
+                                    search_mismatches=:no,
+                                    show_words=false)
+    @test embedded_doc_2 isa Matrix{Float64}
+    @test size(embedded_doc_2, 2) == length(
+        ConceptnetNumberbatch.custom_tokenize(doc_2))
+    zero_cols = [4, 7]  # zero columns
+    for i in 1:size(embedded_doc_2, 2)
+        embedding = embedded_doc_2[:,i]
+        if i in zero_cols
+            @test all(iszero, embedding)
+        else
+            @test !all(iszero, embedding)
+        end
+    end
+end
+
 # show methods
 @testset "Show methods" begin
     buf = IOBuffer()
From caa19898315d0fdc5de2697c8338d53f8076f45d Mon Sep 17 00:00:00 2001
From: Corneliu Cofaru
Date: Mon, 22 Oct 2018 12:35:47 +0200
Subject: [PATCH 6/8] Added new method for document embedding

---
 src/document_embeddings.jl | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/src/document_embeddings.jl b/src/document_embeddings.jl
index 6c811dd..930d15b 100644
--- a/src/document_embeddings.jl
+++ b/src/document_embeddings.jl
@@ -14,18 +14,35 @@ Retrieves the embedding matrix for a given `document`.
 """
 function embed_document(conceptnet::ConceptNet,
-                        document::S where S<:AbstractString;
+                        document::AbstractString;
                         language=Languages.English(),
                         keep_size::Bool=true,
                         max_compound_word_length::Int=1,
                         search_mismatches::Symbol=:no,
                         show_words::Bool=true,
                         distance=Levenshtein())
+    # Split document into tokens and embed
+    return embed_document(conceptnet,
+                          custom_tokenize(document),
+                          language=language,
+                          keep_size=keep_size,
+                          max_compound_word_length=max_compound_word_length,
+                          search_mismatches=search_mismatches,
+                          show_words=show_words,
+                          distance=distance)
+end
+
+function embed_document(conceptnet::ConceptNet,
+                        document_tokens::Vector{S};
+                        language=Languages.English(),
+                        keep_size::Bool=true,
+                        max_compound_word_length::Int=1,
+                        search_mismatches::Symbol=:no,
+                        show_words::Bool=true,
+                        distance=Levenshtein()) where S<:AbstractString
     # Initializations
     sep = "_"
     embeddings = conceptnet.embeddings[language]
-    # Split into tokens
-    document_tokens = custom_tokenize(document)
     # Generate positions of words that can be used for indexing (found)
     # and that can be searched (not_found)
     found = token_search(document_tokens,
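After patch 6 `embed_document` has two methods: one taking a raw string and one taking a pre-tokenized vector, the former simply delegating to the latter. A hedged usage sketch against the patch-6 API (assuming `conceptnet` holds loaded English embeddings; at this stage the function still returns just the matrix):

```julia
doc = "julia is fast"

# String method: tokenizes internally with custom_tokenize, then delegates
emb1 = embed_document(conceptnet, doc,
                      language=Languages.English(),
                      show_words=false)

# Vector method: the caller controls tokenization
emb2 = embed_document(conceptnet, ["julia", "is", "fast"],
                      language=Languages.English(),
                      show_words=false)

emb1 == emb2  # true, given identical tokenizations
```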
""" -function custom_tokenize(doc::AbstractString, splitter::Regex=DEFAULT_SPLITTER) +function tokenize_for_conceptnet(doc::AbstractString, splitter::Regex=DEFAULT_SPLITTER) # First, split tokens = strip.(split(doc, splitter)) # Filter out empty strings @@ -17,18 +17,22 @@ function embed_document(conceptnet::ConceptNet, document::AbstractString; language=Languages.English(), keep_size::Bool=true, + compound_word_separator::String="_", max_compound_word_length::Int=1, + wildcard_matching::Bool=false, search_mismatches::Symbol=:no, - show_words::Bool=true, + print_matched_words::Bool=false, distance=Levenshtein()) # Split document into tokens and embed return embed_document(conceptnet, - custom_tokenize(document), + tokenize_for_conceptnet(document), language=language, keep_size=keep_size, + compound_word_separator=compound_word_separator, max_compound_word_length=max_compound_word_length, + wildcard_matching=wildcard_matching, search_mismatches=search_mismatches, - show_words=show_words, + print_matched_words=print_matched_words, distance=distance) end @@ -36,34 +40,42 @@ function embed_document(conceptnet::ConceptNet, document_tokens::Vector{S}; language=Languages.English(), keep_size::Bool=true, + compound_word_separator::String="_", max_compound_word_length::Int=1, + wildcard_matching::Bool=false, search_mismatches::Symbol=:no, - show_words::Bool=true, + print_matched_words::Bool=false, distance=Levenshtein()) where S<:AbstractString # Initializations - sep = "_" embeddings = conceptnet.embeddings[language] - # Generate positions of words that can be used for indexing (found) - # and that can be searched (not_found) - found = token_search(document_tokens, - embeddings, - sep=sep, - max_length=max_compound_word_length) + # Get positions of words that can be used for indexing (found) + # and those of words that can be searched (not_found) + found_positions = token_search(conceptnet, + document_tokens; + language=language, + separator=compound_word_separator, + max_length=max_compound_word_length, + wildcard_matching=wildcard_matching) # Get found words - words = Vector{String}() - for pos in found - word = make_word_from_tokens(document_tokens, pos, sep, sep) - push!(words, word) + found_words = Vector{String}() + for pos in found_positions + word = make_word_from_tokens(document_tokens, + pos, + separator=compound_word_separator, + separator_last=compound_word_separator) + push!(found_words, word) end # Get best matches for not found words - words_not_found = setdiff(document_tokens, words) + not_found_positions = setdiff(1:length(document_tokens), + collect.(found_positions)...) + words_not_found = document_tokens[not_found_positions] if keep_size && !isempty(words_not_found) # keep_size has precendence for word in words_not_found if search_mismatches == :no # Insert not found words if exact matches are to be # returned only if a matrix of width equal to the # number of terms is to be returned - push!(words, word) + push!(found_words, word) elseif search_mismatches == :brute_force match_word = "" distmin = Inf @@ -74,28 +86,35 @@ function embed_document(conceptnet::ConceptNet, match_word = dict_word end end - push!(words, match_word) + push!(found_words, match_word) else @warn "The only supported approximate string matching" * " method is :brute_force. Use :no for skipping the" * " search; will not search." 
-                push!(found_words, word)
+                push!(found_words, word)
             end
         end
     end
     # Return
-    show_words && @show words
-    return conceptnet[language, words]
+    if print_matched_words
+        println("Embedded words: $found_words")
+        println("Mismatched words: $words_not_found")
+    end
+    return conceptnet[language, found_words], not_found_positions
 end
 
 # Small function that builds a compound word
-function make_word_from_tokens(tokens, pos, sep, sep_end)
+function make_word_from_tokens(tokens::Vector{S},
+                               pos;
+                               separator::String="_",
+                               separator_last::String="_") where
+    S<:AbstractString
     if length(pos) == 1
         return join(tokens[pos])
     else
-        return join(tokens[pos], sep, sep_end)
+        return join(tokens[pos], separator, separator_last)
     end
 end
 
@@ -104,7 +123,7 @@ end
 # subphrases/words
 # Example:
 # - for a vector: String[a, simpler, world, would, be, more, complicated],
-# max_length=7 and sep='_', it would generate:
+# max_length=7 and separator='_', it would generate:
 # String[a_simpler_world_..._complicated,
 #        a_simpler_world_..._more,
 #        ...
 #        more_complicated,
 #        complicated]
-# TODO(Corneliu): Implement wildcard matching as well
-function token_search(tokens, embeddings; sep::String="_", max_length::Int=3)
+function token_search(conceptnet::ConceptNet{L,K,V},
+                      tokens::S;
+                      language::L=Languages.English(),
+                      separator::String="_",
+                      max_length::Int=3,
+                      wildcard_matching::Bool=false) where
+    {L<:Language, K, V, S<:AbstractVector{<:AbstractString}}
+    # Initializations
+    if wildcard_matching
+        # Build a function that checks whether a token is found in conceptnet,
+        # with or without wildcard matching
+        check_function = (conceptnet, language, token, default)->
+                             !isempty(get(conceptnet[language],  # get from interface.jl
+                                          token,
+                                          default,
+                                          conceptnet.fuzzy_words[language]))
+    else
+        check_function = (conceptnet, language, token, default)->
+                             haskey(conceptnet[language], token)
+    end
     found = Vector{UnitRange{Int}}()
     n = length(tokens)
     i = 1
     j = n
     while i <= n
-        token = join(tokens[i:j], separator, separator)
-        if haskey(embeddings, token) && j-i+1 <= max_length
+        token = join(tokens[i:j], separator, separator)
+        is_match = check_function(conceptnet, language, token, V())
+        if is_match && j-i+1 <= max_length
             push!(found, i:j)
             i = j + 1
             j = n
diff --git a/test/runtests.jl b/test/runtests.jl
index d9cf63a..30c79fd 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -114,43 +114,48 @@ end
     conceptnet = load_embeddings(filepath, languages=[Languages.English()])
     # Document with no matchable words
     doc = "a aaaaa b"
-    embedded_doc = embed_document(conceptnet, doc, keep_size=false,
-                                  max_compound_word_length=1,
-                                  search_mismatches=:no,
-                                  show_words=false)
+    embedded_doc, missed = embed_document(conceptnet,
+                                          doc,
+                                          keep_size=false,
+                                          max_compound_word_length=1,
+                                          search_mismatches=:no)
     @test embedded_doc isa Matrix{Float64}
     @test isempty(embedded_doc)
+    @test length(missed) == 3
-    embedded_doc = embed_document(conceptnet, doc, keep_size=true,
-                                  max_compound_word_length=1,
-                                  search_mismatches=:no,
-                                  show_words=false)
+    embedded_doc, missed = embed_document(conceptnet,
+                                          doc,
+                                          keep_size=true,
+                                          max_compound_word_length=1,
+                                          search_mismatches=:no)
    @test embedded_doc isa Matrix{Float64}
-    @test size(embedded_doc, 2) == length(
-        ConceptnetNumberbatch.custom_tokenize(doc))
+    @test size(embedded_doc, 2) == length(tokenize_for_conceptnet(doc))
+    @test length(missed) == 3
     # Document with all words matchable
-    doc_2 = "Five words: huge adapter, xxxyyyzish, 2342 metres ."
+    doc_2 = "Five words: huge adapter, xxyyzish, 2342 metres ."
-    embedded_doc_2 = embed_document(conceptnet, doc_2, keep_size=false,
-                                    max_compound_word_length=2,
-                                    search_mismatches=:no,
-                                    show_words=false)
+    embedded_doc_2, missed = embed_document(conceptnet,
+                                            doc_2,
+                                            keep_size=false,
+                                            max_compound_word_length=2,
+                                            search_mismatches=:no)
     @test embedded_doc_2 isa Matrix{Float64}
-    @test isempty(embedded_doc_2) # no exact matches (wildcard matching not supported yet)
+    @test isempty(embedded_doc_2)
+    @test length(missed) == length(tokenize_for_conceptnet(doc_2))
-    embedded_doc_2 = embed_document(conceptnet, doc_2, keep_size=true,
-                                    max_compound_word_length=2,
-                                    search_mismatches=:no,
-                                    show_words=false)
+    embedded_doc_2, missed = embed_document(conceptnet,
+                                            doc_2,
+                                            keep_size=true,
+                                            max_compound_word_length=2,
+                                            search_mismatches=:no)
     @test embedded_doc_2 isa Matrix{Float64}
     @test size(embedded_doc_2, 2) == length(tokenize_for_conceptnet(doc_2))
+    @test length(missed) == length(tokenize_for_conceptnet(doc_2))
+    embedded_doc_2, missed = embed_document(conceptnet,
+                                            doc_2,
+                                            keep_size=true,
+                                            wildcard_matching=true,
+                                            max_compound_word_length=2,
+                                            search_mismatches=:no)
+    @show missed
+    @test length(missed) == 0
 end
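After patch 7, `embed_document` returns both the embedding matrix and the positions of tokens that could not be matched, and `token_search` can fall back to the fuzzy `get` when `wildcard_matching=true`. A hedged sketch of the new call shape (hypothetical vocabulary; tokens like "xxyyzish" can only match via stored wildcard words such as `####ish`):

```julia
embedded, missed_positions = embed_document(conceptnet,
                                            "an xxyyzish word",
                                            language=Languages.English(),
                                            keep_size=true,
                                            wildcard_matching=true,
                                            search_mismatches=:no)

size(embedded, 2)          # 3 columns, one per token, since keep_size=true
isempty(missed_positions)  # true only if every token matched, possibly via wildcards
```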
- embedded_doc_2 = embed_document(conceptnet, doc_2, keep_size=false, - max_compound_word_length=2, - search_mismatches=:no, - show_words=false) + doc_2 = "Five words: huge adapter, xxyyzish, 2342 metres ." + embedded_doc_2, missed = embed_document(conceptnet, + doc_2, + keep_size=false, + max_compound_word_length=2, + search_mismatches=:no) @test embedded_doc_2 isa Matrix{Float64} - @test isempty(embedded_doc_2) # no exact matches (wildcard matching not supported yet) - embedded_doc_2 = embed_document(conceptnet, doc_2, keep_size=true, - max_compound_word_length=2, - search_mismatches=:no, - show_words=false) + @test isempty(embedded_doc_2) + @test length(missed) == length(tokenize_for_conceptnet(doc_2)) + embedded_doc_2, missed = embed_document(conceptnet, + doc_2, + keep_size=true, + max_compound_word_length=2, + search_mismatches=:no) @test embedded_doc_2 isa Matrix{Float64} - @test size(embedded_doc_2, 2) == length( - ConceptnetNumberbatch.custom_tokenize(doc_2)) - zero_cols = [4, 7] # zero columns - for i in size(embedded_doc_2, 2) - embedding = embedded_doc_2[:,i] - if i in zero_cols - @test all(iszero, embedding) - else - @test any(iszero, embedding) - end - end + @test size(embedded_doc_2, 2) == length(tokenize_for_conceptnet(doc_2)) + @test length(missed) == length(tokenize_for_conceptnet(doc_2)) + embedded_doc_2, missed = embed_document(conceptnet, + doc_2, + keep_size=true, + wildcard_matching=true, + max_compound_word_length=2, + search_mismatches=:no) + @show missed + @test length(missed) == 0 end # show methods From 6b24ae5a3f93391ba485f8270e97515ecd71886a Mon Sep 17 00:00:00 2001 From: Corneliu Cofaru Date: Tue, 23 Oct 2018 14:41:20 +0200 Subject: [PATCH 8/8] Updated README.md --- README.md | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f720695..0ea2a97 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ julia> conceptnet = load_embeddings(file_conceptnet, languages=:en) # ConceptNet{Languages.English} (compressed): 1 language(s), 150875 embeddings julia> conceptnet["apple"] # Get embeddings for a single word -# 300×1 Array{Int8,2}: +# 300-element Array{Int8,1}: # 0 # 0 # 1 @@ -98,11 +98,27 @@ julia> # `keys` returns an iterator for all words # couvents ``` +Document embedding is quite straightforward: +```julia +julia> doc = "embed this document containing X_#-s231 which cannot be embedded" + edoc, idxs_missed = embed_document(conceptnet, doc, language=Languages.English(), keep_size=false) + missed_words = tokenize_for_conceptnet(doc)[idx_missed] + println("Missed word: $missed_word") + edoc +# Missed word: SubString{String}["X_#-s231"] +# 300×8 Array{Int8,2}: +# 0 0 0 0 0 1 0 0 +# -1 -2 -1 -1 -3 -2 -3 0 +# 1 5 0 4 6 6 6 2 +# ... +``` + ## Remarks - fast for retrieving embeddings of exact matches - fast for retrieving embeddings of wildcard matches (`xyzabcish` is matched to `######ish`) + - fast document embedding - if neither exact or wildcard matches exist, retrieval can be based on string distances (slow, see `src/search.jl`) - for another package handling word embeddings, check out [Embeddings.jl](https://github.com/JuliaText/Embeddings.jl)