Skip to content

Commit

Permalink
Merge pull request #2 from zgornel/indexing_search_and_improvements
Browse files Browse the repository at this point in the history
Indexing search and improvements
  • Loading branch information
Corneliu Cofaru authored Sep 26, 2018
2 parents 7ff75db + ab926f8 commit b2b2d89
Show file tree
Hide file tree
Showing 16 changed files with 665 additions and 171 deletions.
6 changes: 3 additions & 3 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@ git:

## uncomment the following lines to allow failures on nightly julia
## (tests will run but not make your overall status red)
#matrix:
# allow_failures:
# - julia: nightly
matrix:
allow_failures:
- os: osx # travis timing out

after_success:
# push coverage results to Coveralls
Expand Down
92 changes: 61 additions & 31 deletions Manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ version = "0.8.10"

[[BinaryProvider]]
deps = ["Libdl", "Pkg", "SHA", "Test"]
git-tree-sha1 = "b530fbeb6f41ab5a83fbe3db1fcbe879334bcd2d"
git-tree-sha1 = "48c147e63431adbcee69bc40b04c3f0fec0a4982"
uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
version = "0.4.2"
version = "0.5.0"

[[Blosc]]
deps = ["BinaryProvider", "CMakeWrapper", "Compat", "Libdl"]
Expand All @@ -26,10 +26,10 @@ uuid = "e1450e63-4bb3-523b-b2a4-4ffa8c0fd77d"
version = "1.0.0"

[[CMake]]
deps = ["BinDeps", "Libdl", "Test"]
git-tree-sha1 = "573cf0e6029444bf26353eaf2b293369e888ca46"
deps = ["BinDeps", "Libdl", "Pkg", "Test"]
git-tree-sha1 = "4f0b34e12d4d2c6a367d62c73c961ad468d62b7b"
uuid = "631607c0-34d2-5d66-819e-eb0f9aa2061a"
version = "1.0.1"
version = "1.0.2"

[[CMakeWrapper]]
deps = ["BinDeps", "CMake", "Libdl", "Parameters", "Pkg", "Test"]
Expand All @@ -45,15 +45,9 @@ version = "0.5.0"

[[Compat]]
deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
git-tree-sha1 = "ae262fa91da6a74e8937add6b613f58cd56cdad4"
git-tree-sha1 = "ff2595695fc4f14427358ce2593f867085c45dcb"
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
version = "1.1.0"

[[DataStructures]]
deps = ["InteractiveUtils", "REPL", "Random", "Serialization", "Test"]
git-tree-sha1 = "2afbbd0294306b0b74a753c196be50b35edb625c"
uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
version = "0.11.1"
version = "1.2.0"

[[Dates]]
deps = ["Printf"]
Expand All @@ -63,15 +57,21 @@ uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
deps = ["Mmap"]
uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"

[[Distances]]
deps = ["LinearAlgebra", "Pkg", "Printf", "Random", "Statistics", "Test"]
git-tree-sha1 = "2f38605722542f1c0a32dd2856fb529d8c226c69"
uuid = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
version = "0.7.3"

[[Distributed]]
deps = ["LinearAlgebra", "Random", "Serialization", "Sockets"]
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"

[[HDF5]]
deps = ["BinDeps", "Blosc", "Distributed", "Homebrew", "Libdl", "LinearAlgebra", "Pkg", "Test", "WinRPM"]
git-tree-sha1 = "2407ec97bdb872ed0d569ff8e592c7a5d60675bc"
deps = ["BinDeps", "Blosc", "Distributed", "Homebrew", "Libdl", "LinearAlgebra", "Mmap", "Pkg", "Test", "WinRPM"]
git-tree-sha1 = "8c3bcdb44db436cd20106e2381e1c1ac96aa0ee3"
uuid = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
version = "0.10.0"
version = "0.10.2"

[[HTTPClient]]
deps = ["Compat", "LibCURL"]
Expand All @@ -89,17 +89,29 @@ version = "0.7.0"
deps = ["LinearAlgebra", "Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"

[[IterTools]]
deps = ["Pkg", "SparseArrays", "Test"]
git-tree-sha1 = "ed0787e62dc46b8d8c7c3db54391d71e0da5fefd"
uuid = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
version = "1.0.0"

[[JSON]]
deps = ["Dates", "Distributed", "Mmap", "Pkg", "Sockets", "Test", "Unicode"]
git-tree-sha1 = "fec8e4d433072731466d37ed0061b3ba7f70eeb9"
uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
version = "0.19.0"

[[Languages]]
deps = ["JSON", "Test"]
git-tree-sha1 = "460302a55509ae9aef8222bdc0c2a6482d8930f3"
uuid = "8ef0a80b-9436-5d2c-a485-80b904378c43"
version = "0.4.0"

[[LibCURL]]
deps = ["BinaryProvider", "Compat", "Libdl", "Pkg"]
git-tree-sha1 = "f7c4fd37291207ec6bd4b7b9abfc512d4bb8b88e"
deps = ["BinaryProvider", "Compat", "Libdl", "Pkg", "Printf"]
git-tree-sha1 = "6339c87cb76923a3cf947fcd213cbc364355c9c9"
uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
version = "0.3.1"
version = "0.4.1"

[[LibExpat]]
deps = ["Compat", "Pkg"]
Expand Down Expand Up @@ -133,11 +145,23 @@ uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
[[Mmap]]
uuid = "a63ad114-7e13-5084-954f-fe012c677804"

[[NearestNeighbors]]
deps = ["Distances", "LinearAlgebra", "Mmap", "Pkg", "StaticArrays", "Test"]
git-tree-sha1 = "aab46b96ae5c2a9c08146188016d6312276094e5"
uuid = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
version = "0.4.2"

[[OrderedCollections]]
deps = ["Pkg", "Random", "Serialization", "Test"]
git-tree-sha1 = "85619a3f3e17bb4761fe1b1fd47f0e979f964d5b"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.0.2"

[[Parameters]]
deps = ["Compat", "DataStructures", "REPL"]
git-tree-sha1 = "9554e6665968d1ff6f5342b188475163b05e527d"
deps = ["Markdown", "OrderedCollections", "Pkg", "REPL", "Test"]
git-tree-sha1 = "40f540ec96e50c0b2b9efdb11b5e4d0c63f90923"
uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a"
version = "0.9.2"
version = "0.10.1"

[[Pkg]]
deps = ["Dates", "LibGit2", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
Expand All @@ -147,12 +171,6 @@ uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
deps = ["Unicode"]
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"

[[ProgressMeter]]
deps = ["Printf", "Random", "Test"]
git-tree-sha1 = "09e653f4b0a3c44628f0bdd0a0e58bc92e0264ef"
uuid = "92933f4c-e287-5a05-a399-4b506db050ca"
version = "0.6.0"

[[REPL]]
deps = ["InteractiveUtils", "Markdown", "Sockets"]
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
Expand All @@ -178,10 +196,22 @@ uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
deps = ["LinearAlgebra", "Random"]
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"

[[StaticArrays]]
deps = ["InteractiveUtils", "LinearAlgebra", "Random", "Statistics", "Test"]
git-tree-sha1 = "d432c79bef174a830304f8601427a4357dfdbfb7"
uuid = "90137ffa-7385-5640-81b9-e52037218182"
version = "0.8.3"

[[Statistics]]
deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

[[StringDistances]]
deps = ["Distances", "IterTools", "Test"]
git-tree-sha1 = "41fddd579b75e0cd0d1bbdb2d68a2a9cc588c164"
uuid = "88034a9c-02f8-509d-84a9-84ec65e18404"
version = "0.3.0"

[[Test]]
deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Expand All @@ -206,7 +236,7 @@ uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"

[[WinRPM]]
deps = ["BinDeps", "Compat", "HTTPClient", "LibExpat", "Libdl", "Libz", "SHA", "URIParser"]
git-tree-sha1 = "023faa2430b4ad3f9cf073c03cf100d80630fb6d"
deps = ["BinDeps", "Compat", "HTTPClient", "LibExpat", "Libdl", "Libz", "URIParser"]
git-tree-sha1 = "2a889d320f3b77d17c037f295859fe570133cfbf"
uuid = "c17dfb99-b4f7-5aad-8812-456da1ad7187"
version = "0.4.0"
version = "0.4.2"
4 changes: 3 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ version = "0.0.1"
[deps]
CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43"
NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
StringDistances = "88034a9c-02f8-509d-84a9-84ec65e18404"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
18 changes: 12 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,28 +11,34 @@ An Julia interface to [ConceptNetNumberbatch](https://github.com/commonsense/con
## Introduction

This package is a simple API to *ConceptNetNumberbatch*.
TODO

## Documentation

TODO
There is little documentation available; however, these examples illustrate some common usage patterns:
- TO DO: Add usage examples



## Limitations and Caveats

TODO
## Remarks

- pretty fast for retrieving an existing word
- slow for retrieving a mismatch
- could be wrong for mismatches
- retrieval is based on string distances
- it is not possible to retrieve embeddings from multiple distinct languages at the same time (in a single indexing operation)
- decreasing the vocabulary size based on language (i.e. detect the language of the text before searching) may increase performance significantly at the cost of more mismatches for rare words


## Installation

The installation can be done through the usual channels (manually by cloning the repository or installing it through the Julia `REPL`).



## Remarks

At this point this is a work in progress and should NOT be used.
At this point this is a work in progress and should NOT be used. For an alternative to this
package (with respect to word embeddings), check out [Embeddings.jl](https://github.com/JuliaText/Embeddings.jl)



Expand Down
6 changes: 6 additions & 0 deletions REQUIRE
Original file line number Diff line number Diff line change
@@ -1 +1,7 @@
julia 0.7
TranscodingStreams
CodecZlib
HDF5
Languages
StringDistances
NearestNeighbors
21 changes: 21 additions & 0 deletions scripts/_benchmark_string_distances.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
using Pkg; Pkg.activate(".");
using StringDistances
using ConceptnetNumberbatch
using BenchmarkTools
using Random
using Languages
using Serialization

# Load the serialized ConceptNet object. `open(f, path)` closes the file
# handle even if deserialization throws.
cptnet = open(deserialize, "./_conceptnet_/numberbatch-en-17.06.txt.bin")

# Materialize the ASCII-only keys stored under `Languages.English()` once, so
# the position returned by `findmin` can index the vocabulary directly instead
# of re-collecting the filtered key set on every iteration.
words = [key for key in keys(cptnet[Languages.English()]) if isascii(key)]

# Deliberately misspelled query used to compare the string distances.
target = "sstring"
for dist in [Jaro(), Levenshtein(), DamerauLevenshtein(), Cosine(), QGram(2), QGram(3)]
    # Untimed first pass (result discarded); presumably a warm-up so that the
    # `@time` measurement below is not dominated by compilation.
    findmin(map(x -> evaluate(dist, target, x), words))
    println("---------------")
    @time _, idx = findmin(map(x -> evaluate(dist, target, x), words))
    println("[$dist], best match: $(words[idx])")
end

21 changes: 21 additions & 0 deletions scripts/get_word_embeddings.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
using Pkg
Pkg.activate(".")
using ConceptnetNumberbatch
using Serialization

# Load a serialized version of ConceptNetEnglish. `open(f, path)` closes the
# file handle even if deserialization throws.
cptnet = open(deserialize, "./_conceptnet_/numberbatch-en-17.06.txt.bin")

## Get an embedding matrix
# The phrase intentionally contains misspelled words ("containz", "iwords").
phrase = "this is a phrase that containz some iwords"

# Untimed first call (result discarded); presumably a warm-up so that the
# timings below are not dominated by compilation.
ConceptnetNumberbatch.word_embeddings(
    cptnet, phrase, keep_size=false, search_mismatches=false
)

# Time the retrieval with `search_mismatches` disabled, then enabled.
@time embs = ConceptnetNumberbatch.word_embeddings(
    cptnet, phrase, keep_size=false, search_mismatches=false
)
@time embs = ConceptnetNumberbatch.word_embeddings(
    cptnet, phrase, keep_size=false, search_mismatches=true
)
println("Loaded $(size(embs, 2)) embedding vectors (out of $(length(split(phrase)))), $(size(embs,1)) elements each.")

20 changes: 20 additions & 0 deletions scripts/test_word_embeddings.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
using Pkg
Pkg.activate(".")
using ConceptnetNumberbatch
using Serialization

# Load a serialized version of ConceptNetEnglish. `open(f, path)` closes the
# file handle even if deserialization throws.
cptnet = open(deserialize, "./_conceptnet_/numberbatch-en-17.06.txt.bin")

## Get an embedding matrix
# The phrase intentionally contains misspelled words ("containz", "iwords").
phrase = "this is a phrase that containz some iwords"

# Untimed first call (result discarded); presumably a warm-up so that the
# timing below is not dominated by compilation.
ConceptnetNumberbatch.word_embeddings(
    cptnet, phrase, keep_size=false, search_mismatches=false
)

# Time the retrieval with `search_mismatches` enabled.
@time embs = ConceptnetNumberbatch.word_embeddings(
    cptnet, phrase, keep_size=false, search_mismatches=true
)
# The "(out of N)" parenthesis is now closed in the printed message.
println("Loaded $(size(embs, 2)) embedding vectors (out of $(length(split(phrase)))), $(size(embs,1)) elements each.")

23 changes: 23 additions & 0 deletions scripts/test_word_model.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
using Pkg
Pkg.activate(".")
using ConceptnetNumberbatch
using Serialization

# Load a serialized version of ConceptNetEnglish. `open(f, path)` closes the
# file handle even if deserialization throws.
cptnet = open(deserialize, "./_conceptnet_/numberbatch-en-17.06.txt.bin")

## get similar words from NN q-gram model
dictionary = collect(keys(cptnet))
@time modelstuff = ConceptnetNumberbatch.build_nn_model(dictionary, ngram_size=2)

# Persist the built model so it can be reloaded with the commented line below.
open("./_conceptnet_/model.bin", "w") do fid
    serialize(fid, modelstuff)
end
#modelstuff = open(deserialize, "./_conceptnet_/model.bin", "r")

# Misspelled query words to look up in the model.
targets = ["phrasis", "appled", "fdellity", "moanster"]

# Untimed first call; presumably a warm-up. The result is discarded rather than
# bound to a global `found`, so the loop variable of the same name below is
# unambiguously local to the loop.
ConceptnetNumberbatch.get_similar_words(targets[1], modelstuff...)
for target in targets
    @time found = ConceptnetNumberbatch.get_similar_words(target, modelstuff...)
    println("$target => $(found[1])")
end
Loading

0 comments on commit b2b2d89

Please sign in to comment.