diff --git a/Project.toml b/Project.toml index 9557c64..8b632f1 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "ConceptnetNumberbatch" uuid = "2d1d9008-b762-11e8-11f1-375fdd6dca71" authors = ["Corneliu Cofaru "] -version = "0.0.1" +version = "0.1.0" [deps] CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" diff --git a/README.md b/README.md index 080cf2d..f720695 100644 --- a/README.md +++ b/README.md @@ -10,23 +10,101 @@ An Julia interface to [ConceptNetNumberbatch](https://github.com/commonsense/con ## Introduction -This package is a simple API to *ConceptNetNumberbatch*. +This package is a simple API to **ConceptNetNumberbatch**. + -## Documentation -There is little documentation available however these examples illustrate some common usage patterns: - - TO DO: Add usage examples +## Documentation +The following examples illustrate some common usage patterns: + +```julia +julia> using Conceptnet, Languages + file_conceptnet = download_embeddings(url=CONCEPTNET_HDF5_LINK, + localfile="./_conceptnet_/conceptnet.h5"); +# [ Info: Download ConceptNetNumberbatch to ./_conceptnet_/conceptnet.h5... +# % Total % Received % Xferd Average Speed Time Time Time Current +# Dload Upload Total Spent Left Speed +# 100 127M 100 127M 0 0 3646k 0 0:00:35 0:00:35 --:--:-- 4107k +# "./_conceptnet_/conceptnet.h5" + +# Load embeddings +julia> conceptnet = load_embeddings(file_conceptnet, languages=:en) +# ConceptNet{Languages.English} (compressed): 1 language(s), 150875 embeddings + +julia> conceptnet["apple"] # Get embeddings for a single word +# 300×1 Array{Int8,2}: +# 0 +# 0 +# 1 +# -4 +# ... + +julia> conceptnet[["apple", "pear", "cherry"]] # Get embeddings for multiple words +# 300×3 Array{Int8,2}: +# 0 0 0 +# 0 0 0 +# 1 1 1 +# -4 -6 -7 +# ... 
+``` + +```julia +# Load multiple languages +julia> conceptnet = load_embeddings(file_conceptnet, languages=[:en, :fr]) +# ConceptNet{Language} (compressed): 2 language(s), 174184 embeddings + +julia> conceptnet["apple"] # fails, language must be specified +# ERROR: ... + +julia> [conceptnet[:en, "apple"] conceptnet[:fr, "poire"]] +# 300×2 Array{Int8,2}: +# 0 -2 +# 0 -2 +# 1 -2 +# -4 -7 +# ... + +# Wildcard matching +julia> conceptnet[:en, "xxyyzish"] # returns embedding for "#####ish" +# 300×1 Array{Int8,2}: +# 5 +# -1 +# 0 +# 1 +# ... +``` + +```julia +# Useful functions +julia> length(conceptnet) # total number of embeddings for all languages +# 174184 + +julia> size(conceptnet) # embedding vector length, number of embeddings +# (300, 174184) + +julia> "apple" in conceptnet # found in the English embeddings +# true + +julia> "poire" in conceptnet # found in the French embeddings +# true + +julia> # `keys` returns an iterator for all words + for word in Iterators.take(keys(conceptnet),3) + println(word) + end +# définie +# invités +# couvents +``` ## Remarks - - pretty fast for retrieving an existing word - - slow for retrieving a mismatch - - could be wrong for mismatches - - retrieval is based on string distances - - it is not possible to retrieve embeddings from multiple distinct languages at the same time (in a single indexing operation) - - decreasing the vocabulary size based on language (i.e. 
detect the language of the text before searching) may increase performance significantly at the cost of more mismatches for rare words + - fast for retrieving embeddings of exact matches + - fast for retrieving embeddings of wildcard matches (`xyzabcish` is matched to `######ish`) + - if neither exact nor wildcard matches exist, retrieval can be based on string distances (slow, see `src/search.jl`) + - for another package handling word embeddings, check out [Embeddings.jl](https://github.com/JuliaText/Embeddings.jl) ## Installation @@ -35,13 +113,6 @@ The installation can be done through the usual channels (manually by cloning the -## Remarks - -At this point this is a work in progress and should NOT be used. For an alternative to this -package (with respect to word embeddings), check out [Embeddings.jl](https://github.com/JuliaText/Embeddings.jl) - - - ## License This code has an MIT license and therefore it is free. @@ -51,4 +122,7 @@ This code has an MIT license and therefore it is free. ## References [1] [ConceptNetNumberbatch GitHub homepage](https://github.com/commonsense/conceptnet-numberbatch) + [2] [ConceptNet GitHub homepage](https://github.com/commonsense/conceptnet5) + +[3] [Embeddings.jl GitHub homepage](https://github.com/JuliaText/Embeddings.jl) diff --git a/REQUIRE b/REQUIRE index cfab78a..a3ddeca 100644 --- a/REQUIRE +++ b/REQUIRE @@ -1,4 +1,4 @@ -julia 0.7 +julia 1.0 TranscodingStreams CodecZlib HDF5 diff --git a/src/ConceptnetNumberbatch.jl b/src/ConceptnetNumberbatch.jl index 1aca404..15566de 100644 --- a/src/ConceptnetNumberbatch.jl +++ b/src/ConceptnetNumberbatch.jl @@ -1,27 +1,22 @@ -################################################################################ -# ConceptnetNumberbatch.jl - an interface for ConceptNetNumberbatch # -# written in Julia by Cornel Cofaru at 0x0α Research, 2018 # -# # -# Paper: # -# Robert Speer, Joshua Chin, and Catherine Havasi (2017). 
# -# "ConceptNet 5.5: An Open Multilingual Graph of General Knowledge." # -# In proceedings of AAAI 2017. # -################################################################################ +# MMMMMMMMMMMMMMMMMMMMMMMMMMMWNNKN0KMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM +# MMMMMMMMMMMMMMMMMMMMMMMMMW0OMMMMX0MMMMMMMMMMXXNMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMNWMMMMNMMMMMMMMMMMMMMMMMMMMMMMM +# MMMMMMMMMMMMMMMMMMMMMMMMMKMMMMMOKWMMMMMMMN:lkkdXMMNXWMMMWMNNMMMMWXWMMMNNMMMWMNNMMMlXWM;.xMMW.MMMWXWMM0dWWMMMMMMMMMMMM +# MMMMMMMMMMMMMMMMMMMMMMMMNXMMMMMXXMMMMMMMM:oMMMMMX,xkl:MK.xkd'MN,dkxMx:Ox,NW dkd'N0 xOM;OccWW.MW;dOcoN:;kNMMMMMMMMMMMM +# MMMMMMMMMMMMMMMMMMWXXXXXkMMMMMMM0MMMMMMMM:lMMMMMocMMM.XK.MMM Mk;MMMM.lxxlKM MMM;xM.NMM;OMk,X.MO.xxdoMolMMMMMMMMMMMMMM +# MMMMMMMMMMMMMMXXXXNMMMMMoMMMMMMMOMMMMMMMMW:cdxdWN;dxlcMK.MMM MW;oxdMk;dxdWM dxo,NM;lkM;OMMX'.MW:oxd0MO'xXMMMMMMMMMMMM +# MMMMMMMMMMMMXXWMMMMMMMMM0KMMMMMOOMMMMMMMMMMMWWMMMMMWMMMMMMMMMMMMMWMMMMWWMMM MWWMMMMWWMMMMMMMMMMMMWWMMMMWMMMMMMMMMMMMM +# MMMMMMMMMMW0WMMMMMMMMMMMMNMMMM0:WMMMMMMMMXxxNMM0kMMMMMMMMMMMMMMMMMMMMXl0MMMxMMMMMMMMMMMMModMMMMMMMMMMMMW0WMMMMMMMMdoM +# MMMMMMMMMM0MMMMMMMMMMMMMMMMMMk'0MMMMMMMMM0cxlXMxoMNxKMWd0M0xOxdOkxd0MXcxkdkWM0xkx0MNd0xxMlokdxNMNxkxkNNxcxkMXxxxOMdlk +# MMMMMMMMMMKMMMMMMMMMNK0XNMMMM:'OWMMMMMMMM0cNOlKkoMXcOMWckMkcXMxcNMdlMXcOMXcOXcdOxcKNcxWMMloMWooMN0OkcOMOcNMWldMMMMdlW +# NKOkk0KK0OkMMMMMMWd;:kWMWXX0kdo;'OMMMMMMM0cNMKlloMNlxXOckMkcWMxcMMdlMXcxXOcKNlxXXKWNcOMMMllKKlxMloX0cOM0c0XMooKXKMdoM +# MMMMWX0kdl:0MMMMX;'lWMMMMMMWk;'0XWMMMMMMMN0WMMX0KMMX00XKXMX0WMX0MMKKMW0N00XMMWK00KMW0XMMMKXK0KWMX00NKXMWK00MWK00XMKKM +# MMMMMMMMMMMWOOoc'o0KWMMMMMMMMWK0MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM -# Remarks: -# #### -# /-----\ -# | O ^ | -# | \_/ | -# \___/ -# - pretty fast for retrieving an existing word -# - slow for retrieving a mismatch -# - could be wrong for mismatches -# - retrieval is based on string distances -# - 
decreasing the vocabulary size based on language -# (i.e. detect the language of the text before -# searching) may increase performance significantly at the cost -# of more mismatches for rare words +# ConceptnetNumberbatch.jl - an interface for ConceptNetNumberbatch written in Julia at 0x0α Research, +# by Corneliu Cofaru, 2018 +# Paper: +# Robert Speer, Joshua Chin, and Catherine Havasi (2017). +# "ConceptNet 5.5: An Open Multilingual Graph of General Knowledge." in proceedings of AAAI 2017. module ConceptnetNumberbatch @@ -32,7 +27,7 @@ using Languages using StringDistances using NearestNeighbors -import Base: getindex, size, length, show, keys, values, in +import Base: get, getindex, size, length, show, keys, values, in # Links pointing to the latest ConceptNetNumberbatch version (v"17.06") const CONCEPTNET_MULTI_LINK = "https://conceptnet.s3.amazonaws.com/downloads/2017/numberbatch/numberbatch-17.06.txt.gz" @@ -40,21 +35,51 @@ const CONCEPTNET_EN_LINK = "https://conceptnet.s3.amazonaws.com/downloads/2017/n const CONCEPTNET_HDF5_LINK = "https://conceptnet.s3.amazonaws.com/precomputed-data/2016/numberbatch/17.06/mini.h5" # Accepted languages (map from conceptnet to Languages.Language) -const LANG_MAP = Dict(:en=>Languages.English(), - :fr=>Languages.French(), - :de=>Languages.German(), - :it=>Languages.Italian(), - :fi=>Languages.Finnish(), - :nl=>Languages.Dutch(), - :af=>Languages.Dutch(), - :pt=>Languages.Portuguese(), - :es=>Languages.Spanish(), - :ru=>Languages.Russian(), - :ro=>Languages.Romanian(), - :sw=>Languages.Swedish() - # add more mappings here if needed - # AND supported by Languages.jl - ) +const LANGUAGES = Dict(:en=>Languages.English(), + :fr=>Languages.French(), + :de=>Languages.German(), + :it=>Languages.Italian(), + :fi=>Languages.Finnish(), + :nl=>Languages.Dutch(), + :af=>Languages.Dutch(), + :pt=>Languages.Portuguese(), + :es=>Languages.Spanish(), + :ru=>Languages.Russian(), + :sh=>Languages.Serbian(),# and Languages.Croatian() + 
:sw=>Languages.Swedish(), + :cs=>Languages.Czech(), + :pl=>Languages.Polish(), + :bg=>Languages.Bulgarian(), + :eo=>Languages.Esperanto(), + :hu=>Languages.Hungarian(), + :el=>Languages.Greek(), + :no=>Languages.Nynorsk(), + :sl=>Languages.Slovene(), + :ro=>Languages.Romanian(), + :vi=>Languages.Vietnamese(), + :lv=>Languages.Latvian(), + :tr=>Languages.Turkish(), + :da=>Languages.Danish(), + :ar=>Languages.Arabic(), + :fa=>Languages.Persian(), + :ko=>Languages.Korean(), + :th=>Languages.Thai(), + :ka=>Languages.Georgian(), + :he=>Languages.Hebrew(), + :te=>Languages.Telugu(), + :et=>Languages.Estonian(), + :hi=>Languages.Hindi(), + :lt=>Languages.Lithuanian(), + :uk=>Languages.Ukrainian(), + :be=>Languages.Belarusian(), + :sw=>Languages.Swahili(), + :ur=>Languages.Urdu(), + :ku=>Languages.Kurdish(), + :az=>Languages.Azerbaijani(), + :ta=>Languages.Tamil() + # add more mappings here if needed + # AND supported by Languages.jl + ) export CONCEPTNET_MULTI_LINK, CONCEPTNET_EN_LINK, diff --git a/src/files.jl b/src/files.jl index a237e00..900d11c 100644 --- a/src/files.jl +++ b/src/files.jl @@ -5,8 +5,8 @@ pointed to by `localfile`. function download_embeddings(;url=CONCEPTNET_EN_LINK, localfile=abspath("./_conceptnet_/" * split(url,"/")[end])) - _dir = join(split(localfile, "/")[1:end-1], "/") - !isempty(_dir) && !isdir(_dir) && mkpath(_dir) + directory = join(split(localfile, "/")[1:end-1], "/") + !isempty(directory) && !isdir(directory) && mkpath(directory) @info "Download ConceptNetNumberbatch to $localfile..." 
if !isfile(localfile) download(url, localfile) @@ -29,10 +29,16 @@ function load_embeddings(filepath::AbstractString; keep_words=String[], languages::Union{Nothing, Languages.Language, - Vector{<:Languages.Language} + Vector{<:Languages.Language}, + Symbol, + Vector{Symbol} }=nothing) - if languages == nothing - languages = unique(collect(values(LANG_MAP))) + if languages isa Nothing + languages = unique(collect(values(LANGUAGES))) + elseif languages isa Symbol + languages = LANGUAGES[languages] + elseif languages isa Vector{Symbol} + languages = [LANGUAGES[lang] for lang in languages] end if any(endswith.(filepath, [".gz", ".gzip"])) @@ -68,7 +74,7 @@ function _load_gz_embeddings(filepath::S1, Vector{<:Languages.Language} }=nothing) where {S1<:AbstractString, S2<:AbstractString} - local lang_embs, _length::Int, _width::Int, type_lang + local lang_embs, _length::Int, _width::Int, type_lang, fuzzy_words type_word = String type_vector = Vector{Float64} open(filepath, "r") do fid @@ -79,6 +85,7 @@ function _load_gz_embeddings(filepath::S1, keep_words) lang_embs, languages, type_lang, english_only = process_language_argument(languages, type_word, type_vector) + fuzzy_words = Dict{type_lang, Vector{type_word}}() no_custom_words = length(keep_words)==0 lang = :en cnt = 0 @@ -89,12 +96,14 @@ function _load_gz_embeddings(filepath::S1, lang = Symbol(_lang) end if word in keep_words || no_custom_words - if lang in keys(LANG_MAP) && LANG_MAP[lang] in languages # use only languages mapped in LANG_MAP - _llang = LANG_MAP[lang] + if lang in keys(LANGUAGES) && LANGUAGES[lang] in languages # use only languages mapped in LANGUAGES + _llang = LANGUAGES[lang] if !(_llang in keys(lang_embs)) push!(lang_embs, _llang=>Dict{type_word, type_vector}()) + push!(fuzzy_words, _llang=>type_word[]) end _, embedding = _parseline(line, word_only=false) + occursin("#", word) && push!(fuzzy_words[_llang], word) push!(lang_embs[_llang], word=>embedding) cnt+=1 if cnt > vocab_size-1 @@ -105,7 +114,7 
@@ function _load_gz_embeddings(filepath::S1, end close(cfid) end - return ConceptNet{type_lang, type_word, type_vector}(lang_embs, _width), _length, _width + return ConceptNet{type_lang, type_word, type_vector}(lang_embs, _width, fuzzy_words) end @@ -119,6 +128,7 @@ function _load_hdf5_embeddings(filepath::S1, Vector{<:Languages.Language} }=nothing) where {S1<:AbstractString, S2<:AbstractString} + local fuzzy_words type_word = String type_vector = Vector{Int8} payload = h5open(read, filepath)["mat"] @@ -132,15 +142,18 @@ function _load_hdf5_embeddings(filepath::S1, keep_words) lang_embs, languages, type_lang, _ = process_language_argument(languages, type_word, type_vector) + fuzzy_words = Dict{type_lang, Vector{type_word}}() no_custom_words = length(keep_words)==0 cnt = 0 for (idx, (lang, word)) in enumerate(words) if word in keep_words || no_custom_words - if lang in keys(LANG_MAP) && LANG_MAP[lang] in languages # use only languages mapped in LANG_MAP - _llang = LANG_MAP[lang] + if lang in keys(LANGUAGES) && LANGUAGES[lang] in languages # use only languages mapped in LANGUAGES + _llang = LANGUAGES[lang] if !(_llang in keys(lang_embs)) push!(lang_embs, _llang=>Dict{type_word, type_vector}()) + push!(fuzzy_words, _llang=>type_word[]) end + occursin("#", word) && push!(fuzzy_words[_llang], word) push!(lang_embs[_llang], word=>embeddings[:,idx]) cnt+=1 if cnt > vocab_size-1 @@ -149,9 +162,7 @@ function _load_hdf5_embeddings(filepath::S1, end end end - _length::Int = length(words) - _width::Int = size(embeddings,1) - return ConceptNet{type_lang, type_word, type_vector}(lang_embs, _width), _length, _width + return ConceptNet{type_lang, type_word, type_vector}(lang_embs, size(embeddings,1), fuzzy_words) end @@ -167,7 +178,7 @@ function process_language_argument(languages::Nothing, type_word::T1, type_vector::T2) where {T1, T2} return Dict{Languages.Language, Dict{type_word, type_vector}}(), - collect(language for language in LANG_MAP), + collect(language for language in 
LANGUAGES), Languages.Language, false end diff --git a/src/interface.jl b/src/interface.jl index 350ac26..cca4ec2 100644 --- a/src/interface.jl +++ b/src/interface.jl @@ -1,11 +1,12 @@ struct ConceptNet{L<:Language, K<:AbstractString, V<:AbstractVector} embeddings::Dict{L, Dict{K,V}} width::Int + fuzzy_words::Dict{L, Vector{K}} end ConceptNet(embeddings::Dict{K,V}, width::Int) where {K<:AbstractString, V<:AbstractVector} = - ConceptNet{Languages.English(), K, V}(embeddings, width) + ConceptNet{Languages.English(), K, V}(embeddings, width, Dict(Languages.English()=>K[])) # Aliases @@ -33,15 +34,51 @@ show(io::IO, conceptnet::ConceptNetEnglish) = -# Indexing +# Overloaded `get` method for a ConceptNet language dictionary +# Example: the embedding corresponding to "###_something" is returned for any search query +# of two words where the first word is made out of 3 letters followed by +# the word 'something' +function get(embeddings::Dict{K,V}, keyword, default::V, fuzzy_words::Vector{K}) where {K<:AbstractString, V<:AbstractVector} + words = keys(embeddings) + if keyword in words + # The keyword exists in the dictionary + return embeddings[keyword] + else + # The keyword is not found; try fuzzy matching + ω = 0.4 # weight assigned to matching a #, 1-ω weight assigned to a matching letter + L = length(keyword) + matches = (word for word in fuzzy_words + if length(word) == L && + occursin(Regex(replace(word,"#"=>".")), keyword)) + if isempty(matches) + return default + else + best_match = "" + max_score = 0 + for match in matches + l = length(replace(match,"#"=>"")) # number of letters matched + score = ω*(L-l)/L + (1-ω)*l/L + if score > max_score + best_match = match + max_score = score + end + end + return embeddings[best_match] + end + end +end + + +# Indexing # Generic indexing, multiple words # Example: julia> conceptnet[Languages.English(), ["another", "word"]) getindex(conceptnet::ConceptNet{L,K,V}, language::L, words::S) where {L<:Language, K, V, 
S<:AbstractVector{<:AbstractString}} = hcat((get(conceptnet.embeddings[language], word, - zeros(eltype(V), conceptnet.width)) + zeros(eltype(V), conceptnet.width), + conceptnet.fuzzy_words[language]) for word in words)... )::Matrix{eltype(V)} @@ -49,7 +86,7 @@ getindex(conceptnet::ConceptNet{L,K,V}, language::L, words::S) where # Example: julia> conceptnet[:en, ["another", "word"]] getindex(conceptnet::ConceptNet{L,K,V}, language::Symbol, words::S) where {L<:Language, K, V, S<:AbstractVector{<:AbstractString}} = - conceptnet[LANG_MAP[language], words] + conceptnet[LANGUAGES[language], words] # Generic indexing, single word # Example: julia> conceptnet[Languages.English(), "word"] @@ -61,7 +98,7 @@ getindex(conceptnet::ConceptNet{L,K,V}, language::L, word::S) where # Example: julia> conceptnet[:en, "word"] getindex(conceptnet::ConceptNet{L,K,V}, language::Symbol, word::S) where {L<:Language, K, V, S<:AbstractString} = - conceptnet[LANG_MAP[language], [word]] + conceptnet[LANGUAGES[language], [word]] # Single-language indexing: conceptnet[["another", "word"]], if language==Languages.English() getindex(conceptnet::ConceptNet{L,K,V}, words::S) where @@ -79,7 +116,7 @@ getindex(conceptnet::ConceptNet, language::L) where {L<:Languages.Language} = # Index by language (returns a Dict{word=>embedding}) getindex(conceptnet::ConceptNet, language::Symbol) = - conceptnet.embeddings[LANG_MAP[language]] + conceptnet.embeddings[LANGUAGES[language]] diff --git a/src/search.jl b/src/search.jl index 41395d0..c769a6b 100644 --- a/src/search.jl +++ b/src/search.jl @@ -86,19 +86,19 @@ function token_search(tokens, dictionary; sep::String="_", max_length::Int=3) n = length(tokens) i = 1 j = n - while i<=n + while i <= n token = join(tokens[i:j], sep, sep) if token in dictionary && j-i+1 <= max_length push!(found, i:j) - i=j+1 - j=n + i = j + 1 + j = n continue else - if i==j - j=n - i+=1 + if i == j + j = n + i+= 1 else - j-=1 + j-= 1 end end end diff --git a/test/runtests.jl 
b/test/runtests.jl index 93da73f..7e6f267 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -27,20 +27,16 @@ const CONCEPTNET_TEST_DATA = Dict( # filename => output type @testset "Parser: (no arguments)" begin for (filename, (languages, _, resulting_type)) in CONCEPTNET_TEST_DATA - conceptnet, _len, _width = load_embeddings(filename, languages=languages); + conceptnet = load_embeddings(filename, languages=languages); @test conceptnet isa resulting_type - @test _len isa Int - @test _width isa Int - @test _width == size(conceptnet, 1) end end max_vocab_size=5 @testset "Parser: max_vocab_size=5" begin for (filename, (languages, _, _)) in CONCEPTNET_TEST_DATA - conceptnet, _len, _width = load_embeddings(filename, - max_vocab_size=max_vocab_size, - languages=languages); + conceptnet = load_embeddings(filename, max_vocab_size=max_vocab_size, + languages=languages); @test length(conceptnet) == max_vocab_size end end @@ -48,10 +44,8 @@ end max_vocab_size=5 @testset "Parser: max_vocab_size=5, 3 keep words" begin for (filename, (languages, keep_words, _)) in CONCEPTNET_TEST_DATA - conceptnet, _len, _width = load_embeddings(filename, - max_vocab_size=max_vocab_size, - keep_words=keep_words, - languages=languages) + conceptnet = load_embeddings(filename, max_vocab_size=max_vocab_size, + keep_words=keep_words, languages=languages) @test length(conceptnet) == length(keep_words) for word in keep_words @test word in conceptnet @@ -62,7 +56,7 @@ end @testset "Indexing" begin # English language filepath = joinpath(string(@__DIR__), "data", "_test_file_en.txt.gz") - conceptnet, _, _ = load_embeddings(filepath, languages=[Languages.English()]) + conceptnet = load_embeddings(filepath, languages=[Languages.English()]) words = ["####_ish", "####_form", "####_metres", "not_found", "not_found2"] # Test indexing @@ -82,7 +76,7 @@ end # Multiple languages filepath = joinpath(string(@__DIR__), "data", "_test_file.txt") - conceptnet, _, _ = load_embeddings(filepath, languages=nothing) + 
conceptnet = load_embeddings(filepath, languages=nothing) words = ["1_konings", "aaklig", "aak", "maggunfully"] # Test indexing @@ -103,6 +97,17 @@ end end end +@testset "Fuzzy Indexing" begin + filepath = joinpath(string(@__DIR__), "data", "_test_file_en.txt.gz") + conceptnet = load_embeddings(filepath, languages=[Languages.English()]) + words_and_matches = Dict("aq" => "##", + "p'"=>"##", + "ab," =>"###", + "ddsaw_metres"=>"#####_metres") + for (word, matching_word) in words_and_matches + @test conceptnet[word] == conceptnet[matching_word] + end +end # show methods @testset "Show methods" begin