-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2 from zgornel/indexing_search_and_improvements
Indexing search and improvements
- Loading branch information
Showing
16 changed files
with
665 additions
and
171 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,7 @@ | ||
julia 0.7 | ||
TranscodingStreams | ||
CodecZlib | ||
HDF5 | ||
Languages | ||
StringDistances | ||
NearestNeighbors |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
using Pkg; Pkg.activate("."); | ||
using StringDistances | ||
using ConceptnetNumberbatch | ||
using BenchmarkTools | ||
using Random | ||
using Languages | ||
using Serialization | ||
|
||
# Load the serialized ConceptNet object. The function form of `open` closes the
# file even when `deserialize` throws.
cptnet = open(deserialize, "./_conceptnet_/numberbatch-en-17.06.txt.bin")

# Materialize the ASCII-only keys of the English entry once, so the candidates are
# not re-filtered on every pass and the `findmin` index can be used directly.
words = [key for key in keys(cptnet[Languages.English()]) if isascii(key)]

target="sstring"
for dist in [Jaro(), Levenshtein(), DamerauLevenshtein(), Cosine(), QGram(2), QGram(3)]
    # A single closure is shared by the warm-up and the timed call; two separate
    # anonymous functions would be distinct types and the warm-up would not
    # compile the specialization that is actually timed.
    score = word -> evaluate(dist, target, word)

    # Untimed warm-up run (result discarded).
    findmin(map(score, words))
    println("---------------")
    @time _, idx = findmin(map(score, words))
    println("[$dist], best match: $(words[idx])")
end
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
using Pkg | ||
Pkg.activate(".") | ||
using ConceptnetNumberbatch | ||
using Serialization | ||
|
||
# Load a serialized version of ConceptNetEnglish. The function form of `open`
# closes the file even when `deserialize` throws.
cptnet = open(deserialize, "./_conceptnet_/numberbatch-en-17.06.txt.bin")

## Get an embedding matrix
phrase = "this is a phrase that containz some iwords"

# Untimed first call, so the timings below are not dominated by compilation of
# `word_embeddings`.
ConceptnetNumberbatch.word_embeddings(cptnet, phrase, keep_size=false, search_mismatches=false)

# Time the lookup with mismatch search disabled, then enabled.
@time embs=ConceptnetNumberbatch.word_embeddings(cptnet, phrase, keep_size=false, search_mismatches=false)
@time embs=ConceptnetNumberbatch.word_embeddings(cptnet, phrase, keep_size=false, search_mismatches=true)

# `embs` holds the result of the last call (search_mismatches=true).
println("Loaded $(size(embs, 2)) embedding vectors (out of $(length(split(phrase)))), $(size(embs,1)) elements each.")
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
using Pkg | ||
Pkg.activate(".") | ||
using ConceptnetNumberbatch | ||
using Serialization | ||
|
||
# Load a serialized version of ConceptNetEnglish. The function form of `open`
# closes the file even when `deserialize` throws.
cptnet = open(deserialize, "./_conceptnet_/numberbatch-en-17.06.txt.bin")

## Get an embedding matrix
phrase = "this is a phrase that containz some iwords"

# Untimed first call, so the timing below is not dominated by compilation of
# `word_embeddings`.
ConceptnetNumberbatch.word_embeddings(cptnet, phrase, keep_size=false, search_mismatches=false)

# Time the lookup with mismatch search enabled.
@time embs=ConceptnetNumberbatch.word_embeddings(cptnet, phrase, keep_size=false, search_mismatches=true)

# The literal "(out of ..." parenthesis is closed right after the word count.
println("Loaded $(size(embs, 2)) embedding vectors (out of $(length(split(phrase)))), $(size(embs,1)) elements each.")
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
using Pkg | ||
Pkg.activate(".") | ||
using ConceptnetNumberbatch | ||
using Serialization | ||
|
||
# Load a serialized version of ConceptNetEnglish. The function form of `open`
# closes the file even when `deserialize` throws.
cptnet = open(deserialize, "./_conceptnet_/numberbatch-en-17.06.txt.bin")

## get similar words from NN q-gram model
dictionary = collect(keys(cptnet))
@time modelstuff = ConceptnetNumberbatch.build_nn_model(dictionary, ngram_size=2)

# Persist the freshly built model next to the embeddings file.
open("./_conceptnet_/model.bin", "w") do fid
    serialize(fid, modelstuff)
end
# To reuse the saved model instead of rebuilding it, replace the build step with:
#modelstuff = open(deserialize, "./_conceptnet_/model.bin", "r")

targets = ["phrasis", "appled", "fdellity", "moanster"]

# Untimed warm-up call. Its result is deliberately discarded: binding it to a
# global `found` would make the assignment inside the loop below ambiguous
# (soft-scope warning when this script is run as a file on Julia >= 1.5).
ConceptnetNumberbatch.get_similar_words(targets[1], modelstuff...)

for target in targets
    @time found = ConceptnetNumberbatch.get_similar_words(target, modelstuff...)
    println("$target => $(found[1])")
end
Oops, something went wrong.