From 8774aac1a2e2f31dbcd5af44909bff9cace5ea99 Mon Sep 17 00:00:00 2001 From: Corneliu Cofaru Date: Fri, 19 Oct 2018 14:36:33 +0200 Subject: [PATCH 1/8] Dependency updates --- Manifest.toml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 9ef3c4b..fc77a98 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -9,9 +9,9 @@ version = "0.8.10" [[BinaryProvider]] deps = ["Libdl", "Pkg", "SHA", "Test"] -git-tree-sha1 = "48c147e63431adbcee69bc40b04c3f0fec0a4982" +git-tree-sha1 = "9930c1a6cd49d9fcd7218df6be417e6ae4f1468a" uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" -version = "0.5.0" +version = "0.5.2" [[Blosc]] deps = ["BinaryProvider", "CMakeWrapper", "Compat", "Libdl"] @@ -27,9 +27,9 @@ version = "1.0.0" [[CMake]] deps = ["BinDeps", "Libdl", "Pkg", "Test"] -git-tree-sha1 = "4f0b34e12d4d2c6a367d62c73c961ad468d62b7b" +git-tree-sha1 = "74853a75c26a4a73ac391ee26ee29ebeb5583d9f" uuid = "631607c0-34d2-5d66-819e-eb0f9aa2061a" -version = "1.0.2" +version = "1.1.0" [[CMakeWrapper]] deps = ["BinDeps", "CMake", "Libdl", "Parameters", "Pkg", "Test"] @@ -45,9 +45,9 @@ version = "0.5.0" [[Compat]] deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] -git-tree-sha1 = "ff2595695fc4f14427358ce2593f867085c45dcb" +git-tree-sha1 = "2d9e14d19bad3f9ad5cc5e4cffabc3cfa59de825" uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "1.2.0" +version = "1.3.0" [[Dates]] deps = ["Printf"] @@ -91,9 +91,9 @@ uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" [[IterTools]] deps = ["Pkg", "SparseArrays", "Test"] -git-tree-sha1 = "ed0787e62dc46b8d8c7c3db54391d71e0da5fefd" +git-tree-sha1 = "79246285c43602384e6f1943b3554042a3712056" uuid = "c8e1da08-722c-5040-9ed9-7db0dc04731e" -version = "1.0.0" +version = "1.1.1" [[JSON]] deps = ["Dates", "Distributed", "Mmap", "Pkg", "Sockets", "Test", "Unicode"] From bc0250685744e533c3328ff16520a030971b6ff2 Mon Sep 17 00:00:00 2001 From: Corneliu Cofaru Date: Fri, 19 Oct 2018 14:51:07 +0200 Subject: [PATCH 2/8] Added constants to defaults.jl --- src/ConceptnetNumberbatch.jl | 53 +----------------------------------- src/defaults.jl | 51 ++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 52 deletions(-) create mode 100644 src/defaults.jl diff --git a/src/ConceptnetNumberbatch.jl b/src/ConceptnetNumberbatch.jl index 15566de..85e1f4d 100644 --- a/src/ConceptnetNumberbatch.jl +++ b/src/ConceptnetNumberbatch.jl @@ -29,58 +29,6 @@ using NearestNeighbors import Base: get, getindex, size, length, show, keys, values, in -# Links pointing to the latest ConceptNetNumberbatch version (v"17.06") -const CONCEPTNET_MULTI_LINK = "https://conceptnet.s3.amazonaws.com/downloads/2017/numberbatch/numberbatch-17.06.txt.gz" -const CONCEPTNET_EN_LINK = "https://conceptnet.s3.amazonaws.com/downloads/2017/numberbatch/numberbatch-en-17.06.txt.gz" -const CONCEPTNET_HDF5_LINK = "https://conceptnet.s3.amazonaws.com/precomputed-data/2016/numberbatch/17.06/mini.h5" - -# Accepted languages (map from conceptnet to Languages.Language) -const LANGUAGES = Dict(:en=>Languages.English(), - :fr=>Languages.French(), - :de=>Languages.German(), - :it=>Languages.Italian(), - :fi=>Languages.Finnish(), - :nl=>Languages.Dutch(), - :af=>Languages.Dutch(), - :pt=>Languages.Portuguese(), - :es=>Languages.Spanish(), - :ru=>Languages.Russian(), 
-                       :sh=>Languages.Serbian(),  # and Languages.Croatian()
-                       :sv=>Languages.Swedish(),
-                       :cs=>Languages.Czech(),
-                       :pl=>Languages.Polish(),
-                       :bg=>Languages.Bulgarian(),
-                       :eo=>Languages.Esperanto(),
-                       :hu=>Languages.Hungarian(),
-                       :el=>Languages.Greek(),
-                       :no=>Languages.Nynorsk(),
-                       :sl=>Languages.Slovene(),
-                       :ro=>Languages.Romanian(),
-                       :vi=>Languages.Vietnamese(),
-                       :lv=>Languages.Latvian(),
-                       :tr=>Languages.Turkish(),
-                       :da=>Languages.Danish(),
-                       :ar=>Languages.Arabic(),
-                       :fa=>Languages.Persian(),
-                       :ko=>Languages.Korean(),
-                       :th=>Languages.Thai(),
-                       :ka=>Languages.Georgian(),
-                       :he=>Languages.Hebrew(),
-                       :te=>Languages.Telugu(),
-                       :et=>Languages.Estonian(),
-                       :hi=>Languages.Hindi(),
-                       :lt=>Languages.Lithuanian(),
-                       :uk=>Languages.Ukrainian(),
-                       :be=>Languages.Belarusian(),
-                       :sw=>Languages.Swahili(),
-                       :ur=>Languages.Urdu(),
-                       :ku=>Languages.Kurdish(),
-                       :az=>Languages.Azerbaijani(),
-                       :ta=>Languages.Tamil()
-                       # add more mappings here if needed
-                       # AND supported by Languages.jl
-                       )
-
 export CONCEPTNET_MULTI_LINK,
        CONCEPTNET_EN_LINK,
        CONCEPTNET_HDF5_LINK,
@@ -89,6 +37,7 @@ export CONCEPTNET_MULTI_LINK,
        load_embeddings,
        phrase_embeddings
 
+include("defaults.jl")
 include("interface.jl")
 include("files.jl")
 include("search.jl")
diff --git a/src/defaults.jl b/src/defaults.jl
new file mode 100644
index 0000000..0d3c14d
--- /dev/null
+++ b/src/defaults.jl
@@ -0,0 +1,51 @@
+# Links pointing to the latest ConceptNetNumberbatch version (v"17.06")
+const CONCEPTNET_MULTI_LINK = "https://conceptnet.s3.amazonaws.com/downloads/2017/numberbatch/numberbatch-17.06.txt.gz"
+const CONCEPTNET_EN_LINK = "https://conceptnet.s3.amazonaws.com/downloads/2017/numberbatch/numberbatch-en-17.06.txt.gz"
+const CONCEPTNET_HDF5_LINK = "https://conceptnet.s3.amazonaws.com/precomputed-data/2016/numberbatch/17.06/mini.h5"
+
+# Accepted languages (map from conceptnet to Languages.Language)
+const LANGUAGES = Dict(:en=>Languages.English(),
+                       :fr=>Languages.French(),
+                       :de=>Languages.German(),
+                       :it=>Languages.Italian(),
+                       :fi=>Languages.Finnish(),
+                       :nl=>Languages.Dutch(),
+                       :af=>Languages.Dutch(),
+                       :pt=>Languages.Portuguese(),
+                       :es=>Languages.Spanish(),
+                       :ru=>Languages.Russian(),
+                       :sh=>Languages.Serbian(),  # and Languages.Croatian()
+                       :sv=>Languages.Swedish(),  # Swedish is :sv in ConceptNet; :sw is Swahili
+                       :cs=>Languages.Czech(),
+                       :pl=>Languages.Polish(),
+                       :bg=>Languages.Bulgarian(),
+                       :eo=>Languages.Esperanto(),
+                       :hu=>Languages.Hungarian(),
+                       :el=>Languages.Greek(),
+                       :no=>Languages.Nynorsk(),
+                       :sl=>Languages.Slovene(),
+                       :ro=>Languages.Romanian(),
+                       :vi=>Languages.Vietnamese(),
+                       :lv=>Languages.Latvian(),
+                       :tr=>Languages.Turkish(),
+                       :da=>Languages.Danish(),
+                       :ar=>Languages.Arabic(),
+                       :fa=>Languages.Persian(),
+                       :ko=>Languages.Korean(),
+                       :th=>Languages.Thai(),
+                       :ka=>Languages.Georgian(),
+                       :he=>Languages.Hebrew(),
+                       :te=>Languages.Telugu(),
+                       :et=>Languages.Estonian(),
+                       :hi=>Languages.Hindi(),
+                       :lt=>Languages.Lithuanian(),
+                       :uk=>Languages.Ukrainian(),
+                       :be=>Languages.Belarusian(),
+                       :sw=>Languages.Swahili(),
+                       :ur=>Languages.Urdu(),
+                       :ku=>Languages.Kurdish(),
+                       :az=>Languages.Azerbaijani(),
+                       :ta=>Languages.Tamil()
+                       # add more mappings here if needed
+                       # AND supported by Languages.jl
+                       )
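Patch 2 only moves constants into `defaults.jl`, but for orientation, here is a hedged sketch of how they are consumed downstream. The local file path is illustrative and not part of the patch; `LANGUAGES` is not exported, so it is accessed through the module:

```julia
# Illustrative only: map a ConceptNet language code to a Languages.jl language
# and fetch the English embeddings using the link constants from defaults.jl.
using ConceptnetNumberbatch, Languages

lang = ConceptnetNumberbatch.LANGUAGES[:fr]   # Languages.French()
download_embeddings(url=CONCEPTNET_EN_LINK,
                    localfile="./numberbatch-en-17.06.txt.gz")  # hypothetical path
```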
From 6c24b4fd7c0fd97d55e21f68f820651f5fd3845c Mon Sep 17 00:00:00 2001
From: Corneliu Cofaru
Date: Fri, 19 Oct 2018 19:09:44 +0200
Subject: [PATCH 3/8] Improved speed of phrase embeddings

---
 src/interface.jl | 26 +++++++++++++---------
 src/search.jl    | 58 +++++++++++++++++++++++++++---------------------
 2 files changed, 49 insertions(+), 35 deletions(-)

diff --git a/src/interface.jl b/src/interface.jl
index 79aa5d8..fda2637 100644
--- a/src/interface.jl
+++ b/src/interface.jl
@@ -38,9 +38,9 @@ show(io::IO, conceptnet::ConceptNetEnglish) =
 # Example: the embedding corresponding to "###_something" is returned for any search query
 # of two words where the first word is made of 3 letters followed by
 # the word 'something'
-function get(embeddings::Dict{K,V}, keyword, default::V, fuzzy_words::Vector{K}) where {K<:AbstractString, V<:AbstractVector}
-    words = keys(embeddings)
-    if keyword in words
+function get(embeddings::Dict{K,V}, keyword, default::V, fuzzy_words::Vector{K}) where
+    {K<:AbstractString, V<:AbstractVector}
+    if haskey(embeddings, keyword)
         # The keyword exists in the dictionary
         return embeddings[keyword]
     else
@@ -73,14 +73,20 @@ end
 # Indexing
 # Generic indexing, multiple words
 # Example: julia> conceptnet[Languages.English(), ["another", "word"]]
+# TODO: make type stable; add a new get method for keyword vectors
 getindex(conceptnet::ConceptNet{L,K,V}, language::L, words::S) where
-    {L<:Language, K, V, S<:AbstractVector{<:AbstractString}} =
-    hcat((get(conceptnet.embeddings[language],
-              word,
-              zeros(eltype(V), conceptnet.width),
-              conceptnet.fuzzy_words[language])
-          for word in words)...
-        )::Matrix{eltype(V)}
+    {L<:Language, K, V, S<:AbstractVector{<:AbstractString}} = begin
+    if !isempty(words)
+        hcat((get(conceptnet.embeddings[language],
+                  word,
+                  zeros(eltype(V), conceptnet.width),
+                  conceptnet.fuzzy_words[language])
+              for word in words)...
+            )::Matrix{eltype(V)}
+    else
+        Vector{eltype(V)}()
+    end
+end
 
 # Generic indexing, multiple words
 # Example: julia> conceptnet[:en, ["another", "word"]]
diff --git a/src/search.jl b/src/search.jl
index c769a6b..4b5ccff 100644
--- a/src/search.jl
+++ b/src/search.jl
@@ -11,38 +11,46 @@ function phrase_embeddings(conceptnet::ConceptNet,
                            distance=Levenshtein())
     # Initializations
     sep = "_"
-    tokens = split(phrase)
-    dictionary = collect(keys(conceptnet.embeddings[language]))
+    phrase_tokens = strip.(split(phrase))
+    embeddings = conceptnet.embeddings[language]
     # Generate positions of words that can be used for indexing (found)
     # and that can be searched (not_found)
-    found = token_search(tokens,
-                         dictionary,
+    found = token_search(phrase_tokens,
+                         embeddings,
                          sep=sep,
                          max_length=max_compound_word_length)
-    not_found = setdiff(1:length(tokens), found...)
     # Get found words
     words = Vector{String}()
     for pos in found
-        word = make_word_from_tokens(tokens, pos, sep, sep)
+        word = make_word_from_tokens(phrase_tokens, pos, sep, sep)
         push!(words, word)
     end
     # Get best matches for not found words
-    for pos in not_found
-        word = make_word_from_tokens(tokens, pos, sep, sep)
-        if search_mismatches == :no
-            # Insert not found words if exact matches are to be
-            # returned only if a matrix of width equal to the
-            # number of terms is to be returned
-            keep_size && push!(words, word)
-        elseif search_mismatches == :brute_force
-            matcher = dict_word->evaluate(distance, word, dict_word)
-            _, match_pos = findmin(map(matcher, dictionary))
-            push!(words, dictionary[match_pos])
-        else
-            @warn "The only supported approximate string matching" *
-                  " method is :brute_force. Use :no for skipping the" *
-                  " search; will not search."
-            push!(words, word)
+    words_not_found = setdiff(phrase_tokens, words)
+    if keep_size && !isempty(words_not_found)  # keep_size has precedence
+        for word in words_not_found
+            if search_mismatches == :no
+                # Insert not found words if exact matches are to be
+                # returned only if a matrix of width equal to the
+                # number of terms is to be returned
+                push!(words, word)
+            elseif search_mismatches == :brute_force
+                match_word = ""
+                distmin = Inf
+                for dict_word in keys(embeddings)
+                    dist = evaluate(distance, word, dict_word)
+                    if dist < distmin
+                        distmin = dist
+                        match_word = dict_word
+                    end
+                end
+                push!(words, match_word)
+            else
+                @warn "The only supported approximate string matching" *
+                      " method is :brute_force. Use :no to skip the" *
+                      " search; no search will be performed."
+                push!(words, word)
+            end
         end
     end
     # Return
@@ -62,7 +70,7 @@ function make_word_from_tokens(tokens, pos, sep, sep_end)
 end
 
 # Function that searches subphrases (continuous token combinations)
-# from a phrase in a dictionary and returns the positions of matched
+# from a phrase in the embedded words and returns the positions of matched
 # subphrases/words
 # Example:
 # - for a vector: String[a, simpler, world, would, be, more, complicated],
 # max_length=7 and sep='_', it would generate:
 # String[a_simpler_world_..._complicated,
 #        a_simpler_world_..._more,
 #        ...
 #        more_complicated,
 #        complicated]
-function token_search(tokens, dictionary; sep::String="_", max_length::Int=3)
+function token_search(tokens, embeddings; sep::String="_", max_length::Int=3)
     found = Vector{UnitRange{Int}}()
     n = length(tokens)
     i = 1
     j = n
     while i <= n
         token = join(tokens[i:j], sep, sep)
-        if token in dictionary && j-i+1 <= max_length
+        if haskey(embeddings, token) && j-i+1 <= max_length
             push!(found, i:j)
             i = j + 1
             j = n
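The `:brute_force` branch introduced in patch 3 is a linear nearest-neighbour scan over the whole vocabulary. A minimal standalone sketch of that idea (a hypothetical helper, not part of the patch; it uses the same StringDistances `evaluate` API the patch relies on):

```julia
using StringDistances

# Return the vocabulary word closest to `word` under `distance`,
# scanning every entry (O(n) in vocabulary size, hence "brute force").
function closest_word(word::AbstractString, vocabulary, distance=Levenshtein())
    match_word, distmin = "", Inf
    for dict_word in vocabulary
        dist = evaluate(distance, word, dict_word)
        if dist < distmin
            distmin, match_word = dist, dict_word
        end
    end
    return match_word
end

closest_word("wrld", ["word", "world", "sword"])  # returns "world" (distance 1)
```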
From 32620824535c65c83f3d746431bf2d93cdb1845c Mon Sep 17 00:00:00 2001
From: Corneliu Cofaru
Date: Fri, 19 Oct 2018 20:43:22 +0200
Subject: [PATCH 4/8] Improved type stability

---
 src/interface.jl | 41 +++++++++++++++++++++++------------------
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/src/interface.jl b/src/interface.jl
index fda2637..c89cf0f 100644
--- a/src/interface.jl
+++ b/src/interface.jl
@@ -38,7 +38,7 @@ show(io::IO, conceptnet::ConceptNetEnglish) =
 # Example: the embedding corresponding to "###_something" is returned for any search query
 # of two words where the first word is made of 3 letters followed by
 # the word 'something'
-function get(embeddings::Dict{K,V}, keyword, default::V, fuzzy_words::Vector{K}) where
+function get(embeddings::Dict{K,V}, keyword::K, default::V, fuzzy_words::Vector{K}) where
     {K<:AbstractString, V<:AbstractVector}
     if haskey(embeddings, keyword)
         # The keyword exists in the dictionary
@@ -68,25 +68,27 @@ function get(embeddings::Dict{K,V}, keyword, default::V, fuzzy_words::Vector{K})
     end
 end
 
-
+function get(embeddings::Dict{K,V}, keywords::AbstractVector{K}, default::V, fuzzy_words::Vector{K};
+             n::Int=0) where
+    {K<:AbstractString, V<:AbstractVector}
+    p = length(keywords)
+    keywords_embedded = Matrix{eltype(V)}(undef, n, p)
+    for i in 1:p
+        keywords_embedded[:,i] = get(embeddings, keywords[i], default, fuzzy_words)
+    end
+    return keywords_embedded
+end
 # Indexing
 # Generic indexing, multiple words
 # Example: julia> conceptnet[Languages.English(), ["another", "word"]]
-# TODO: make type stable; add a new get method for keyword vectors
 getindex(conceptnet::ConceptNet{L,K,V}, language::L, words::S) where
-    {L<:Language, K, V, S<:AbstractVector{<:AbstractString}} = begin
-    if !isempty(words)
-        hcat((get(conceptnet.embeddings[language],
-                  word,
-                  zeros(eltype(V), conceptnet.width),
-                  conceptnet.fuzzy_words[language])
-              for word in words)...
-            )::Matrix{eltype(V)}
-    else
-        Vector{eltype(V)}()
-    end
-end
+    {L<:Language, K, V, S<:AbstractVector{<:AbstractString}} =
+    get(conceptnet.embeddings[language],
+        words,
+        zeros(eltype(V), conceptnet.width),
+        conceptnet.fuzzy_words[language],
+        n=conceptnet.width)
 
 # Generic indexing, multiple words
 # Example: julia> conceptnet[:en, ["another", "word"]]
@@ -98,13 +100,16 @@ getindex(conceptnet::ConceptNet{L,K,V}, language::Symbol, words::S) where
 # Generic indexing, single word
 # Example: julia> conceptnet[Languages.English(), "word"]
 getindex(conceptnet::ConceptNet{L,K,V}, language::L, word::S) where
     {L<:Language, K, V, S<:AbstractString} =
-    conceptnet[language, [word]]
+    get(conceptnet.embeddings[language],
+        word,
+        zeros(eltype(V), conceptnet.width),
+        conceptnet.fuzzy_words[language])
 
 # Generic indexing, single word
 # Example: julia> conceptnet[:en, "word"]
 getindex(conceptnet::ConceptNet{L,K,V}, language::Symbol, word::S) where
     {L<:Language, K, V, S<:AbstractString} =
-    conceptnet[LANGUAGES[language], [word]]
+    conceptnet[LANGUAGES[language], word]
 
 # Single-language indexing: conceptnet[["another", "word"]], if language==Languages.English()
 getindex(conceptnet::ConceptNet{L,K,V}, words::S) where
@@ -114,7 +119,7 @@ getindex(conceptnet::ConceptNet{L,K,V}, words::S) where
 # Single-language indexing: conceptnet["word"], if language==Languages.English()
 getindex(conceptnet::ConceptNet{L,K,V}, word::S) where
     {L<:Languages.Language, K, V, S<:AbstractString} =
-    conceptnet[L(), [word]]
+    conceptnet[L(), word]
 
 # Index by language (returns a Dict{word=>embedding})
 getindex(conceptnet::ConceptNet, language::L) where {L<:Languages.Language} =
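The new vector `get` method in patch 4 is what makes the indexing type stable: instead of splatting a generator into `hcat` (whose inferred return type is abstract, and which degenerates for empty input), it preallocates a `Matrix{eltype(V)}` of known shape. A minimal illustration of the difference, with a toy dictionary standing in for the embeddings (hypothetical code, not part of the patch):

```julia
embeddings = Dict("a" => [1.0, 2.0], "b" => [3.0, 4.0])

# hcat over a splatted generator: abstract inferred return type, and for an
# empty `words` the call becomes hcat(), which yields a 0-element Vector{Any}
# rather than a Matrix
unstable(words) = hcat((get(embeddings, w, zeros(2)) for w in words)...)

# Preallocation: concrete Matrix{Float64} return type, empty input included
function stable(words, n::Int=2)
    out = Matrix{Float64}(undef, n, length(words))
    for (i, w) in enumerate(words)
        out[:, i] = get(embeddings, w, zeros(n))
    end
    return out
end

stable(["a", "b", "missing"])  # 2×3 Matrix{Float64}, zeros for the missing word
stable(String[])               # 2×0 Matrix{Float64}, no special-casing needed
```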
From 080cf657c0f88282165bdfb52ea34707e96f9830 Mon Sep 17 00:00:00 2001
From: Corneliu Cofaru
Date: Mon, 22 Oct 2018 11:24:19 +0200
Subject: [PATCH 5/8] Many changes: refactored document embedding and added
 tests, improved tokenization, deactivated word models

---
 Manifest.toml                             | 12 -------
 Project.toml                              |  1 -
 REQUIRE                                   |  1 -
 src/ConceptnetNumberbatch.jl              |  5 ++-
 src/defaults.jl                           |  3 ++
 src/{search.jl => document_embeddings.jl} | 42 ++++++++++++++--------
 src/files.jl                              | 26 ++++++++------
 src/interface.jl                          |  3 +-
 src/word_model.jl                         |  7 ++++
 test/runtests.jl                          | 44 +++++++++++++++++++++++
 10 files changed, 101 insertions(+), 43 deletions(-)
 rename src/{search.jl => document_embeddings.jl} (73%)

diff --git a/Manifest.toml b/Manifest.toml
index fc77a98..57b8b67 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -145,12 +145,6 @@ uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
 [[Mmap]]
 uuid = "a63ad114-7e13-5084-954f-fe012c677804"
 
-[[NearestNeighbors]]
-deps = ["Distances", "LinearAlgebra", "Mmap", "Pkg", "StaticArrays", "Test"]
-git-tree-sha1 = "aab46b96ae5c2a9c08146188016d6312276094e5"
-uuid = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
-version = "0.4.2"
-
 [[OrderedCollections]]
 deps = ["Pkg", "Random", "Serialization", "Test"]
 git-tree-sha1 = "85619a3f3e17bb4761fe1b1fd47f0e979f964d5b"
@@ -196,12 +190,6 @@ uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
 deps = ["LinearAlgebra", "Random"]
 uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 
-[[StaticArrays]]
-deps = ["InteractiveUtils", "LinearAlgebra", "Random", "Statistics", "Test"]
-git-tree-sha1 = "d432c79bef174a830304f8601427a4357dfdbfb7"
-uuid = "90137ffa-7385-5640-81b9-e52037218182"
-version = "0.8.3"
-
 [[Statistics]]
 deps = ["LinearAlgebra", "SparseArrays"]
 uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
diff --git a/Project.toml b/Project.toml
index 8b632f1..88040fe 100644
--- 
a/Project.toml +++ b/Project.toml @@ -7,7 +7,6 @@ version = "0.1.0" CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43" -NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce" StringDistances = "88034a9c-02f8-509d-84a9-84ec65e18404" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" diff --git a/REQUIRE b/REQUIRE index a3ddeca..4b31580 100644 --- a/REQUIRE +++ b/REQUIRE @@ -4,4 +4,3 @@ CodecZlib HDF5 Languages StringDistances -NearestNeighbors diff --git a/src/ConceptnetNumberbatch.jl b/src/ConceptnetNumberbatch.jl index 85e1f4d..9bb2ee4 100644 --- a/src/ConceptnetNumberbatch.jl +++ b/src/ConceptnetNumberbatch.jl @@ -25,7 +25,6 @@ using CodecZlib using HDF5 using Languages using StringDistances -using NearestNeighbors import Base: get, getindex, size, length, show, keys, values, in @@ -35,11 +34,11 @@ export CONCEPTNET_MULTI_LINK, ConceptNet, download_embeddings, load_embeddings, - phrase_embeddings + embed_document include("defaults.jl") include("interface.jl") include("files.jl") -include("search.jl") +include("document_embeddings.jl") end # module diff --git a/src/defaults.jl b/src/defaults.jl index 0d3c14d..5be2f29 100644 --- a/src/defaults.jl +++ b/src/defaults.jl @@ -49,3 +49,6 @@ const LANGUAGES = Dict(:en=>Languages.English(), # add more mappings here if needed # AND supported by Languages.jl ) + +# Regular expression on which to split text into tokens +const DEFAULT_SPLITTER = r"(,|:|\\|\/|;|\.|\[|\]|\{|\}|\"|\"|\s+)" diff --git a/src/search.jl b/src/document_embeddings.jl similarity index 73% rename from src/search.jl rename to src/document_embeddings.jl index 4b5ccff..6c811dd 100644 --- a/src/search.jl +++ b/src/document_embeddings.jl @@ -1,32 +1,45 @@ """ -Retrieves the embedding matrix for a given `phrase`. +Fast tokenization function. """ -function phrase_embeddings(conceptnet::ConceptNet, - phrase::S where S<:AbstractString; - language=Languages.English(), - keep_size::Bool=true, - max_compound_word_length::Int=1, - search_mismatches::Symbol=:no, - show_words::Bool=true, - distance=Levenshtein()) +function custom_tokenize(doc::AbstractString, splitter::Regex=DEFAULT_SPLITTER) + # First, split + tokens = strip.(split(doc, splitter)) + # Filter out empty strings + filter!(!isempty, tokens) +end + + + +""" +Retrieves the embedding matrix for a given `document`. 
+""" +function embed_document(conceptnet::ConceptNet, + document::S where S<:AbstractString; + language=Languages.English(), + keep_size::Bool=true, + max_compound_word_length::Int=1, + search_mismatches::Symbol=:no, + show_words::Bool=true, + distance=Levenshtein()) # Initializations sep = "_" - phrase_tokens = strip.(split(phrase)) embeddings = conceptnet.embeddings[language] + # Split into tokens + document_tokens = custom_tokenize(document) # Generate positions of words that can be used for indexing (found) # and that can be searched (not_found) - found = token_search(phrase_tokens, + found = token_search(document_tokens, embeddings, sep=sep, max_length=max_compound_word_length) # Get found words words = Vector{String}() for pos in found - word = make_word_from_tokens(phrase_tokens, pos, sep, sep) + word = make_word_from_tokens(document_tokens, pos, sep, sep) push!(words, word) end # Get best matches for not found words - words_not_found = setdiff(phrase_tokens, words) + words_not_found = setdiff(document_tokens, words) if keep_size && !isempty(words_not_found) # keep_size has precendence for word in words_not_found if search_mismatches == :no @@ -70,7 +83,7 @@ function make_word_from_tokens(tokens, pos, sep, sep_end) end # Function that searches subphrases (continuous token combinations) -# from a phrase in a the embedded words and returns the positions of matched +# from a document in a the embedded words and returns the positions of matched # subphrases/words # Example: # - for a vector: String[a, simpler, world, would, be, more, complicated], @@ -89,6 +102,7 @@ end # ... # more_complicated, # complicated] +# TODO(Corneliu): Implement wildcard matching as well function token_search(tokens, embeddings; sep::String="_", max_length::Int=3) found = Vector{UnitRange{Int}}() n = length(tokens) diff --git a/src/files.jl b/src/files.jl index f1bf513..cf7024a 100644 --- a/src/files.jl +++ b/src/files.jl @@ -1,6 +1,6 @@ """ -Downloads embeddings given a `url` and saves them to a file -pointed to by `localfile`. +Download ConceptNetNumberbatch embeddings given a `url` and saves them +to a file pointed to by `localfile`. """ function download_embeddings(;url=CONCEPTNET_EN_LINK, localfile=abspath("./_conceptnet_/" * @@ -20,7 +20,7 @@ end """ -Function that loads the embeddings given a valid ConceptNetNumberbatch `filepath`, +Load the embeddings given a valid ConceptNetNumberbatch `filepath`, lading at most `max_vocab_size` embeddings if no specific `keep_words` are specified, filtering on `languages`. """ @@ -63,8 +63,9 @@ function load_embeddings(filepath::AbstractString; end - -# Loads the ConceptNetNumberbatch from a .gz or uncompressed file +""" +Load the ConceptNetNumberbatch embeddings from a .gz or uncompressed file. +""" function _load_gz_embeddings(filepath::S1, decompressor::TranscodingStreams.Codec, max_vocab_size::Union{Nothing,Int}, @@ -118,8 +119,9 @@ function _load_gz_embeddings(filepath::S1, end - -# Loads the ConceptNetNumberbatch from a HDF5 file +""" +Load the ConceptNetNumberbatch embeddings from a HDF5 file. +""" function _load_hdf5_embeddings(filepath::S1, max_vocab_size::Union{Nothing,Int}, keep_words::Vector{S2}; @@ -206,8 +208,9 @@ function process_language_argument(languages::Vector{L}, end - -# Function that calculates how many embeddings to retreive +""" +Calculate how many embeddings to retreive. 
+""" function _get_vocab_size(real_vocab_size, max_vocab_size=nothing, keep_words=String[]) @@ -231,8 +234,9 @@ function _get_vocab_size(real_vocab_size, end - -# Parse a line +""" +Parse a line of text from a ConceptNetNumberbatch delimited file. +""" function _parseline(buf; word_only=false) bufvec = split(buf, " ") word = string(popfirst!(bufvec)) diff --git a/src/interface.jl b/src/interface.jl index c89cf0f..ae6bddd 100644 --- a/src/interface.jl +++ b/src/interface.jl @@ -8,7 +8,6 @@ ConceptNet(embeddings::Dict{K,V}, width::Int) where {K<:AbstractString, V<:AbstractVector} = ConceptNet{Languages.English(), K, V}(embeddings, width, Dict(Languages.English()=>K[])) - # Aliases const ConceptNetMulti{L} = ConceptNet{L, String, Vector{Float64}} const ConceptNetMultiCompressed{L} = ConceptNet{L, String, Vector{Int8}} @@ -79,6 +78,8 @@ function get(embeddings::Dict{K,V}, keywords::AbstractVector{K}, default::V, fuz return keywords_embedded end + + # Indexing # Generic indexing, multiple words # Example: julia> conceptnet[Languages.English(), ["another", "word"]) diff --git a/src/word_model.jl b/src/word_model.jl index d08e8f1..b74faff 100644 --- a/src/word_model.jl +++ b/src/word_model.jl @@ -1,3 +1,8 @@ +# EXPERIMENTAL, not to be used! +# To use, uncomment the code below and add NearestNeighbors. + +#= + # ngram NN-model for faster approximate string matching # So far it is not really workable, results are bad # An example can be found in "./scripts/test_word_model.jl" @@ -70,3 +75,5 @@ function get_ngrams(words::Vector{S}, n::Int=2) where S<:AbstractString end return unique(ngrams) end + +=# diff --git a/test/runtests.jl b/test/runtests.jl index 7e6f267..d9cf63a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -109,6 +109,50 @@ end end end +@testset "Document Embedding" begin + filepath = joinpath(string(@__DIR__), "data", "_test_file_en.txt.gz") + conceptnet = load_embeddings(filepath, languages=[Languages.English()]) + # Document with no matchable words + doc = "a aaaaa b" + embedded_doc = embed_document(conceptnet, doc, keep_size=false, + max_compound_word_length=1, + search_mismatches=:no, + show_words=false) + @test embedded_doc isa Matrix{Float64} + @test isempty(embedded_doc) + embedded_doc = embed_document(conceptnet, doc, keep_size=true, + max_compound_word_length=1, + search_mismatches=:no, + show_words=false) + @test embedded_doc isa Matrix{Float64} + @test size(embedded_doc, 2) == length( + ConceptnetNumberbatch.custom_tokenize(doc)) + # Document with all words matchable + doc_2 = "Five words: huge adapter, xxxyyyzish, 2342 metres ." 
+    embedded_doc_2 = embed_document(conceptnet, doc_2, keep_size=false,
+                                    max_compound_word_length=2,
+                                    search_mismatches=:no,
+                                    show_words=false)
+    @test embedded_doc_2 isa Matrix{Float64}
+    @test isempty(embedded_doc_2)  # no exact matches (wildcard matching not supported yet)
+    embedded_doc_2 = embed_document(conceptnet, doc_2, keep_size=true,
+                                    max_compound_word_length=2,
+                                    search_mismatches=:no,
+                                    show_words=false)
+    @test embedded_doc_2 isa Matrix{Float64}
+    @test size(embedded_doc_2, 2) == length(
+        ConceptnetNumberbatch.custom_tokenize(doc_2))
+    zero_cols = [4, 7]  # zero columns
+    for i in 1:size(embedded_doc_2, 2)
+        embedding = embedded_doc_2[:,i]
+        if i in zero_cols
+            @test all(iszero, embedding)
+        else
+            @test !all(iszero, embedding)
+        end
+    end
+end
+
 # show methods
 @testset "Show methods" begin
     buf = IOBuffer()
From caa19898315d0fdc5de2697c8338d53f8076f45d Mon Sep 17 00:00:00 2001
From: Corneliu Cofaru
Date: Mon, 22 Oct 2018 12:35:47 +0200
Subject: [PATCH 6/8] Added new method for document embedding

---
 src/document_embeddings.jl | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/src/document_embeddings.jl b/src/document_embeddings.jl
index 6c811dd..930d15b 100644
--- a/src/document_embeddings.jl
+++ b/src/document_embeddings.jl
@@ -14,18 +14,35 @@ Retrieves the embedding matrix for a given `document`.
 """
 function embed_document(conceptnet::ConceptNet,
-                        document::S where S<:AbstractString;
+                        document::AbstractString;
                         language=Languages.English(),
                         keep_size::Bool=true,
                         max_compound_word_length::Int=1,
                         search_mismatches::Symbol=:no,
                         show_words::Bool=true,
                         distance=Levenshtein())
+    # Split document into tokens and embed
+    return embed_document(conceptnet,
+                          custom_tokenize(document),
+                          language=language,
+                          keep_size=keep_size,
+                          max_compound_word_length=max_compound_word_length,
+                          search_mismatches=search_mismatches,
+                          show_words=show_words,
+                          distance=distance)
+end
+
+function embed_document(conceptnet::ConceptNet,
+                        document_tokens::Vector{S};
+                        language=Languages.English(),
+                        keep_size::Bool=true,
+                        max_compound_word_length::Int=1,
+                        search_mismatches::Symbol=:no,
+                        show_words::Bool=true,
+                        distance=Levenshtein()) where S<:AbstractString
     # Initializations
     sep = "_"
     embeddings = conceptnet.embeddings[language]
-    # Split into tokens
-    document_tokens = custom_tokenize(document)
     # Generate positions of words that can be used for indexing (found)
     # and that can be searched (not_found)
     found = token_search(document_tokens,
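After patch 6 `embed_document` has two methods: one taking a raw string and one taking a pre-tokenized vector, the former simply delegating to the latter. A hedged usage sketch against the patch-6 API (assuming `conceptnet` holds loaded English embeddings; at this stage the function still returns just the matrix):

```julia
doc = "julia is fast"

# String method: tokenizes internally with custom_tokenize, then delegates
emb1 = embed_document(conceptnet, doc,
                      language=Languages.English(),
                      show_words=false)

# Vector method: the caller controls tokenization
emb2 = embed_document(conceptnet, ["julia", "is", "fast"],
                      language=Languages.English(),
                      show_words=false)

emb1 == emb2  # true, given identical tokenizations
```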
""" -function custom_tokenize(doc::AbstractString, splitter::Regex=DEFAULT_SPLITTER) +function tokenize_for_conceptnet(doc::AbstractString, splitter::Regex=DEFAULT_SPLITTER) # First, split tokens = strip.(split(doc, splitter)) # Filter out empty strings @@ -17,18 +17,22 @@ function embed_document(conceptnet::ConceptNet, document::AbstractString; language=Languages.English(), keep_size::Bool=true, + compound_word_separator::String="_", max_compound_word_length::Int=1, + wildcard_matching::Bool=false, search_mismatches::Symbol=:no, - show_words::Bool=true, + print_matched_words::Bool=false, distance=Levenshtein()) # Split document into tokens and embed return embed_document(conceptnet, - custom_tokenize(document), + tokenize_for_conceptnet(document), language=language, keep_size=keep_size, + compound_word_separator=compound_word_separator, max_compound_word_length=max_compound_word_length, + wildcard_matching=wildcard_matching, search_mismatches=search_mismatches, - show_words=show_words, + print_matched_words=print_matched_words, distance=distance) end @@ -36,34 +40,42 @@ function embed_document(conceptnet::ConceptNet, document_tokens::Vector{S}; language=Languages.English(), keep_size::Bool=true, + compound_word_separator::String="_", max_compound_word_length::Int=1, + wildcard_matching::Bool=false, search_mismatches::Symbol=:no, - show_words::Bool=true, + print_matched_words::Bool=false, distance=Levenshtein()) where S<:AbstractString # Initializations - sep = "_" embeddings = conceptnet.embeddings[language] - # Generate positions of words that can be used for indexing (found) - # and that can be searched (not_found) - found = token_search(document_tokens, - embeddings, - sep=sep, - max_length=max_compound_word_length) + # Get positions of words that can be used for indexing (found) + # and those of words that can be searched (not_found) + found_positions = token_search(conceptnet, + document_tokens; + language=language, + separator=compound_word_separator, + max_length=max_compound_word_length, + wildcard_matching=wildcard_matching) # Get found words - words = Vector{String}() - for pos in found - word = make_word_from_tokens(document_tokens, pos, sep, sep) - push!(words, word) + found_words = Vector{String}() + for pos in found_positions + word = make_word_from_tokens(document_tokens, + pos, + separator=compound_word_separator, + separator_last=compound_word_separator) + push!(found_words, word) end # Get best matches for not found words - words_not_found = setdiff(document_tokens, words) + not_found_positions = setdiff(1:length(document_tokens), + collect.(found_positions)...) + words_not_found = document_tokens[not_found_positions] if keep_size && !isempty(words_not_found) # keep_size has precendence for word in words_not_found if search_mismatches == :no # Insert not found words if exact matches are to be # returned only if a matrix of width equal to the # number of terms is to be returned - push!(words, word) + push!(found_words, word) elseif search_mismatches == :brute_force match_word = "" distmin = Inf @@ -74,28 +86,35 @@ function embed_document(conceptnet::ConceptNet, match_word = dict_word end end - push!(words, match_word) + push!(found_words, match_word) else @warn "The only supported approximate string matching" * " method is :brute_force. Use :no for skipping the" * " search; will not search." 
-                push!(found_words, word)
+                push!(found_words, word)
             end
         end
     end
     # Return
-    show_words && @show words
-    return conceptnet[language, words]
+    if print_matched_words
+        println("Embedded words: $found_words")
+        println("Mismatched words: $words_not_found")
+    end
+    return conceptnet[language, found_words], not_found_positions
 end
 
 # Small function that builds a compound word
-function make_word_from_tokens(tokens, pos, sep, sep_end)
+function make_word_from_tokens(tokens::Vector{S},
+                               pos;
+                               separator::String="_",
+                               separator_last::String="_") where
+    S<:AbstractString
     if length(pos) == 1
         return join(tokens[pos])
     else
-        return join(tokens[pos], sep, sep_end)
+        return join(tokens[pos], separator, separator_last)
     end
 end
 
@@ -104,7 +123,7 @@ end
 # subphrases/words
 # Example:
 # - for a vector: String[a, simpler, world, would, be, more, complicated],
-# max_length=7 and sep='_', it would generate:
+# max_length=7 and separator='_', it would generate:
 # String[a_simpler_world_..._complicated,
 #        a_simpler_world_..._more,
 #        ...
 #        more_complicated,
 #        complicated]
-# TODO(Corneliu): Implement wildcard matching as well
-function token_search(tokens, embeddings; sep::String="_", max_length::Int=3)
+function token_search(conceptnet::ConceptNet{L,K,V},
+                      tokens::S;
+                      language::L=Languages.English(),
+                      separator::String="_",
+                      max_length::Int=3,
+                      wildcard_matching::Bool=false) where
+    {L<:Language, K, V, S<:AbstractVector{<:AbstractString}}
+    # Initializations
+    if wildcard_matching
+        # Build a function that checks whether a token is found in conceptnet,
+        # with or without wildcard matching
+        check_function = (conceptnet, language, token, default)->
+                             !isempty(get(conceptnet[language],  # get from interface.jl
+                                          token,
+                                          default,
+                                          conceptnet.fuzzy_words[language]))
+    else
+        check_function = (conceptnet, language, token, default)->
+                             haskey(conceptnet[language], token)
+    end
     found = Vector{UnitRange{Int}}()
     n = length(tokens)
     i = 1
     j = n
     while i <= n
-        token = join(tokens[i:j], separator, separator)
-        if haskey(embeddings, token) && j-i+1 <= max_length
+        token = join(tokens[i:j], separator, separator)
+        is_match = check_function(conceptnet, language, token, V())
+        if is_match && j-i+1 <= max_length
             push!(found, i:j)
             i = j + 1
             j = n
diff --git a/test/runtests.jl b/test/runtests.jl
index d9cf63a..30c79fd 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -114,43 +114,48 @@ end
     conceptnet = load_embeddings(filepath, languages=[Languages.English()])
     # Document with no matchable words
     doc = "a aaaaa b"
-    embedded_doc = embed_document(conceptnet, doc, keep_size=false,
-                                  max_compound_word_length=1,
-                                  search_mismatches=:no,
-                                  show_words=false)
+    embedded_doc, missed = embed_document(conceptnet,
+                                          doc,
+                                          keep_size=false,
+                                          max_compound_word_length=1,
+                                          search_mismatches=:no)
     @test embedded_doc isa Matrix{Float64}
     @test isempty(embedded_doc)
+    @test length(missed) == 3
-    embedded_doc = embed_document(conceptnet, doc, keep_size=true,
-                                  max_compound_word_length=1,
-                                  search_mismatches=:no,
-                                  show_words=false)
+    embedded_doc, missed = embed_document(conceptnet,
+                                          doc,
+                                          keep_size=true,
+                                          max_compound_word_length=1,
+                                          search_mismatches=:no)
    @test embedded_doc isa Matrix{Float64}
-    @test size(embedded_doc, 2) == length(
-        ConceptnetNumberbatch.custom_tokenize(doc))
+    @test size(embedded_doc, 2) == length(tokenize_for_conceptnet(doc))
+    @test length(missed) == 3
     # Document with all words matchable
-    doc_2 = "Five words: huge adapter, xxxyyyzish, 2342 metres ."
+    doc_2 = "Five words: huge adapter, xxyyzish, 2342 metres ."
-    embedded_doc_2 = embed_document(conceptnet, doc_2, keep_size=false,
-                                    max_compound_word_length=2,
-                                    search_mismatches=:no,
-                                    show_words=false)
+    embedded_doc_2, missed = embed_document(conceptnet,
+                                            doc_2,
+                                            keep_size=false,
+                                            max_compound_word_length=2,
+                                            search_mismatches=:no)
     @test embedded_doc_2 isa Matrix{Float64}
-    @test isempty(embedded_doc_2) # no exact matches (wildcard matching not supported yet)
+    @test isempty(embedded_doc_2)
+    @test length(missed) == length(tokenize_for_conceptnet(doc_2))
-    embedded_doc_2 = embed_document(conceptnet, doc_2, keep_size=true,
-                                    max_compound_word_length=2,
-                                    search_mismatches=:no,
-                                    show_words=false)
+    embedded_doc_2, missed = embed_document(conceptnet,
+                                            doc_2,
+                                            keep_size=true,
+                                            max_compound_word_length=2,
+                                            search_mismatches=:no)
     @test embedded_doc_2 isa Matrix{Float64}
     @test size(embedded_doc_2, 2) == length(tokenize_for_conceptnet(doc_2))
+    @test length(missed) == length(tokenize_for_conceptnet(doc_2))
+    embedded_doc_2, missed = embed_document(conceptnet,
+                                            doc_2,
+                                            keep_size=true,
+                                            wildcard_matching=true,
+                                            max_compound_word_length=2,
+                                            search_mismatches=:no)
+    @show missed
+    @test length(missed) == 0
 end
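After patch 7, `embed_document` returns both the embedding matrix and the positions of tokens that could not be matched, and `token_search` can fall back to the fuzzy `get` when `wildcard_matching=true`. A hedged sketch of the new call shape (hypothetical vocabulary; tokens like "xxyyzish" can only match via stored wildcard words such as `####ish`):

```julia
embedded, missed_positions = embed_document(conceptnet,
                                            "an xxyyzish word",
                                            language=Languages.English(),
                                            keep_size=true,
                                            wildcard_matching=true,
                                            search_mismatches=:no)

size(embedded, 2)          # 3 columns, one per token, since keep_size=true
isempty(missed_positions)  # true only if every token matched, possibly via wildcards
```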
- embedded_doc_2 = embed_document(conceptnet, doc_2, keep_size=false, - max_compound_word_length=2, - search_mismatches=:no, - show_words=false) + doc_2 = "Five words: huge adapter, xxyyzish, 2342 metres ." + embedded_doc_2, missed = embed_document(conceptnet, + doc_2, + keep_size=false, + max_compound_word_length=2, + search_mismatches=:no) @test embedded_doc_2 isa Matrix{Float64} - @test isempty(embedded_doc_2) # no exact matches (wildcard matching not supported yet) - embedded_doc_2 = embed_document(conceptnet, doc_2, keep_size=true, - max_compound_word_length=2, - search_mismatches=:no, - show_words=false) + @test isempty(embedded_doc_2) + @test length(missed) == length(tokenize_for_conceptnet(doc_2)) + embedded_doc_2, missed = embed_document(conceptnet, + doc_2, + keep_size=true, + max_compound_word_length=2, + search_mismatches=:no) @test embedded_doc_2 isa Matrix{Float64} - @test size(embedded_doc_2, 2) == length( - ConceptnetNumberbatch.custom_tokenize(doc_2)) - zero_cols = [4, 7] # zero columns - for i in size(embedded_doc_2, 2) - embedding = embedded_doc_2[:,i] - if i in zero_cols - @test all(iszero, embedding) - else - @test any(iszero, embedding) - end - end + @test size(embedded_doc_2, 2) == length(tokenize_for_conceptnet(doc_2)) + @test length(missed) == length(tokenize_for_conceptnet(doc_2)) + embedded_doc_2, missed = embed_document(conceptnet, + doc_2, + keep_size=true, + wildcard_matching=true, + max_compound_word_length=2, + search_mismatches=:no) + @show missed + @test length(missed) == 0 end # show methods From 6b24ae5a3f93391ba485f8270e97515ecd71886a Mon Sep 17 00:00:00 2001 From: Corneliu Cofaru Date: Tue, 23 Oct 2018 14:41:20 +0200 Subject: [PATCH 8/8] Updated README.md --- README.md | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f720695..0ea2a97 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ julia> conceptnet = load_embeddings(file_conceptnet, languages=:en) # ConceptNet{Languages.English} (compressed): 1 language(s), 150875 embeddings julia> conceptnet["apple"] # Get embeddings for a single word -# 300×1 Array{Int8,2}: +# 300-element Array{Int8,1}: # 0 # 0 # 1 @@ -98,11 +98,27 @@ julia> # `keys` returns an iterator for all words # couvents ``` +Document embedding is quite straightforward: +```julia +julia> doc = "embed this document containing X_#-s231 which cannot be embedded" + edoc, idxs_missed = embed_document(conceptnet, doc, language=Languages.English(), keep_size=false) + missed_words = tokenize_for_conceptnet(doc)[idx_missed] + println("Missed word: $missed_word") + edoc +# Missed word: SubString{String}["X_#-s231"] +# 300×8 Array{Int8,2}: +# 0 0 0 0 0 1 0 0 +# -1 -2 -1 -1 -3 -2 -3 0 +# 1 5 0 4 6 6 6 2 +# ... +``` + ## Remarks - fast for retrieving embeddings of exact matches - fast for retrieving embeddings of wildcard matches (`xyzabcish` is matched to `######ish`) + - fast document embedding - if neither exact or wildcard matches exist, retrieval can be based on string distances (slow, see `src/search.jl`) - for another package handling word embeddings, check out [Embeddings.jl](https://github.com/JuliaText/Embeddings.jl)