
Commit

Merge pull request #6 from zgornel/latest
Improvements
zgornel authored Oct 23, 2018
2 parents 3cb094c + 6b24ae5 commit dc69f9c
Showing 12 changed files with 362 additions and 208 deletions.
28 changes: 8 additions & 20 deletions Manifest.toml
@@ -9,9 +9,9 @@ version = "0.8.10"

[[BinaryProvider]]
deps = ["Libdl", "Pkg", "SHA", "Test"]
git-tree-sha1 = "48c147e63431adbcee69bc40b04c3f0fec0a4982"
git-tree-sha1 = "9930c1a6cd49d9fcd7218df6be417e6ae4f1468a"
uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
version = "0.5.0"
version = "0.5.2"

[[Blosc]]
deps = ["BinaryProvider", "CMakeWrapper", "Compat", "Libdl"]
@@ -27,9 +27,9 @@ version = "1.0.0"

[[CMake]]
deps = ["BinDeps", "Libdl", "Pkg", "Test"]
git-tree-sha1 = "4f0b34e12d4d2c6a367d62c73c961ad468d62b7b"
git-tree-sha1 = "74853a75c26a4a73ac391ee26ee29ebeb5583d9f"
uuid = "631607c0-34d2-5d66-819e-eb0f9aa2061a"
version = "1.0.2"
version = "1.1.0"

[[CMakeWrapper]]
deps = ["BinDeps", "CMake", "Libdl", "Parameters", "Pkg", "Test"]
@@ -45,9 +45,9 @@ version = "0.5.0"

[[Compat]]
deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
git-tree-sha1 = "ff2595695fc4f14427358ce2593f867085c45dcb"
git-tree-sha1 = "2d9e14d19bad3f9ad5cc5e4cffabc3cfa59de825"
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
version = "1.2.0"
version = "1.3.0"

[[Dates]]
deps = ["Printf"]
@@ -91,9 +91,9 @@ uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"

[[IterTools]]
deps = ["Pkg", "SparseArrays", "Test"]
git-tree-sha1 = "ed0787e62dc46b8d8c7c3db54391d71e0da5fefd"
git-tree-sha1 = "79246285c43602384e6f1943b3554042a3712056"
uuid = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
version = "1.0.0"
version = "1.1.1"

[[JSON]]
deps = ["Dates", "Distributed", "Mmap", "Pkg", "Sockets", "Test", "Unicode"]
@@ -145,12 +145,6 @@ uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
[[Mmap]]
uuid = "a63ad114-7e13-5084-954f-fe012c677804"

-[[NearestNeighbors]]
-deps = ["Distances", "LinearAlgebra", "Mmap", "Pkg", "StaticArrays", "Test"]
-git-tree-sha1 = "aab46b96ae5c2a9c08146188016d6312276094e5"
-uuid = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
-version = "0.4.2"

[[OrderedCollections]]
deps = ["Pkg", "Random", "Serialization", "Test"]
git-tree-sha1 = "85619a3f3e17bb4761fe1b1fd47f0e979f964d5b"
@@ -196,12 +190,6 @@ uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
deps = ["LinearAlgebra", "Random"]
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"

-[[StaticArrays]]
-deps = ["InteractiveUtils", "LinearAlgebra", "Random", "Statistics", "Test"]
-git-tree-sha1 = "d432c79bef174a830304f8601427a4357dfdbfb7"
-uuid = "90137ffa-7385-5640-81b9-e52037218182"
-version = "0.8.3"

[[Statistics]]
deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
1 change: 0 additions & 1 deletion Project.toml
@@ -7,7 +7,6 @@ version = "0.1.0"
CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43"
-NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
StringDistances = "88034a9c-02f8-509d-84a9-84ec65e18404"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
18 changes: 17 additions & 1 deletion README.md
@@ -33,7 +33,7 @@ julia> conceptnet = load_embeddings(file_conceptnet, languages=:en)
# ConceptNet{Languages.English} (compressed): 1 language(s), 150875 embeddings

julia> conceptnet["apple"] # Get embeddings for a single word
-# 300×1 Array{Int8,2}:
+# 300-element Array{Int8,1}:
# 0
# 0
# 1
@@ -98,11 +98,27 @@ julia> # `keys` returns an iterator for all words
# couvents
```

Document embedding is quite straightforward:
```julia
julia> doc = "embed this document containing X_#-s231 which cannot be embedded"
edoc, idxs_missed = embed_document(conceptnet, doc, language=Languages.English(), keep_size=false)
missed_words = tokenize_for_conceptnet(doc)[idxs_missed]
println("Missed words: $missed_words")
edoc
# Missed words: SubString{String}["X_#-s231"]
# 300×8 Array{Int8,2}:
# 0 0 0 0 0 1 0 0
# -1 -2 -1 -1 -3 -2 -3 0
# 1 5 0 4 6 6 6 2
# ...
```


## Remarks

- fast for retrieving embeddings of exact matches
- fast for retrieving embeddings of wildcard matches (`xyzabcish` is matched to `######ish`; see the sketch after this list)
- fast document embedding
- if neither exact nor wildcard matches exist, retrieval can fall back on string distances (slow, see `src/search.jl`)
- for another package handling word embeddings, check out [Embeddings.jl](https://github.com/JuliaText/Embeddings.jl)
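
A rough sketch of the wildcard idea mentioned above (the `make_wildcard_key` helper is purely illustrative and not part of the package API; the actual lookup logic lives in `src/search.jl`):
```julia
# Illustrative only: mask the front of an unknown word with '#' characters,
# keeping a short suffix, so "xyzabcish" can fall back to a key like "######ish".
function make_wildcard_key(word::AbstractString, nsuffix::Int=3)
    nsuffix = min(nsuffix, length(word))
    return repeat("#", length(word) - nsuffix) * last(word, nsuffix)
end

make_wildcard_key("xyzabcish")  # "######ish"
```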

1 change: 0 additions & 1 deletion REQUIRE
@@ -4,4 +4,3 @@ CodecZlib
HDF5
Languages
StringDistances
-NearestNeighbors
59 changes: 4 additions & 55 deletions src/ConceptnetNumberbatch.jl
@@ -25,72 +25,21 @@ using CodecZlib
using HDF5
using Languages
using StringDistances
-using NearestNeighbors

import Base: get, getindex, size, length, show, keys, values, in

-# Links pointing to the latest ConceptNetNumberbatch version (v"17.06")
-const CONCEPTNET_MULTI_LINK = "https://conceptnet.s3.amazonaws.com/downloads/2017/numberbatch/numberbatch-17.06.txt.gz"
-const CONCEPTNET_EN_LINK = "https://conceptnet.s3.amazonaws.com/downloads/2017/numberbatch/numberbatch-en-17.06.txt.gz"
-const CONCEPTNET_HDF5_LINK = "https://conceptnet.s3.amazonaws.com/precomputed-data/2016/numberbatch/17.06/mini.h5"
-
-# Accepted languages (map from conceptnet to Languages.Language)
-const LANGUAGES = Dict(:en=>Languages.English(),
-:fr=>Languages.French(),
-:de=>Languages.German(),
-:it=>Languages.Italian(),
-:fi=>Languages.Finnish(),
-:nl=>Languages.Dutch(),
-:af=>Languages.Dutch(),
-:pt=>Languages.Portuguese(),
-:es=>Languages.Spanish(),
-:ru=>Languages.Russian(),
-:sh=>Languages.Serbian(),# and Languages.Croatian()
-:sw=>Languages.Swedish(),
-:cs=>Languages.Czech(),
-:pl=>Languages.Polish(),
-:bg=>Languages.Bulgarian(),
-:eo=>Languages.Esperanto(),
-:hu=>Languages.Hungarian(),
-:el=>Languages.Greek(),
-:no=>Languages.Nynorsk(),
-:sl=>Languages.Slovene(),
-:ro=>Languages.Romanian(),
-:vi=>Languages.Vietnamese(),
-:lv=>Languages.Latvian(),
-:tr=>Languages.Turkish(),
-:da=>Languages.Danish(),
-:ar=>Languages.Arabic(),
-:fa=>Languages.Persian(),
-:ko=>Languages.Korean(),
-:th=>Languages.Thai(),
-:ka=>Languages.Georgian(),
-:he=>Languages.Hebrew(),
-:te=>Languages.Telugu(),
-:et=>Languages.Estonian(),
-:hi=>Languages.Hindi(),
-:lt=>Languages.Lithuanian(),
-:uk=>Languages.Ukrainian(),
-:be=>Languages.Belarusian(),
-:sw=>Languages.Swahili(),
-:ur=>Languages.Urdu(),
-:ku=>Languages.Kurdish(),
-:az=>Languages.Azerbaijani(),
-:ta=>Languages.Tamil()
-# add more mappings here if needed
-# AND supported by Languages.jl
-)

export CONCEPTNET_MULTI_LINK,
CONCEPTNET_EN_LINK,
CONCEPTNET_HDF5_LINK,
ConceptNet,
download_embeddings,
load_embeddings,
-phrase_embeddings
+tokenize_for_conceptnet,
+embed_document

include("defaults.jl")
include("interface.jl")
include("files.jl")
include("search.jl")
include("document_embeddings.jl")

end # module
54 changes: 54 additions & 0 deletions src/defaults.jl
@@ -0,0 +1,54 @@
# Links pointing to the latest ConceptNetNumberbatch version (v"17.06")
const CONCEPTNET_MULTI_LINK = "https://conceptnet.s3.amazonaws.com/downloads/2017/numberbatch/numberbatch-17.06.txt.gz"
const CONCEPTNET_EN_LINK = "https://conceptnet.s3.amazonaws.com/downloads/2017/numberbatch/numberbatch-en-17.06.txt.gz"
const CONCEPTNET_HDF5_LINK = "https://conceptnet.s3.amazonaws.com/precomputed-data/2016/numberbatch/17.06/mini.h5"

# Accepted languages (map from conceptnet to Languages.Language)
const LANGUAGES = Dict(:en=>Languages.English(),
:fr=>Languages.French(),
:de=>Languages.German(),
:it=>Languages.Italian(),
:fi=>Languages.Finnish(),
:nl=>Languages.Dutch(),
:af=>Languages.Dutch(),
:pt=>Languages.Portuguese(),
:es=>Languages.Spanish(),
:ru=>Languages.Russian(),
:sh=>Languages.Serbian(),# and Languages.Croatian()
:sv=>Languages.Swedish(),  # ConceptNet uses :sv for Swedish; :sw below is Swahili
:cs=>Languages.Czech(),
:pl=>Languages.Polish(),
:bg=>Languages.Bulgarian(),
:eo=>Languages.Esperanto(),
:hu=>Languages.Hungarian(),
:el=>Languages.Greek(),
:no=>Languages.Nynorsk(),
:sl=>Languages.Slovene(),
:ro=>Languages.Romanian(),
:vi=>Languages.Vietnamese(),
:lv=>Languages.Latvian(),
:tr=>Languages.Turkish(),
:da=>Languages.Danish(),
:ar=>Languages.Arabic(),
:fa=>Languages.Persian(),
:ko=>Languages.Korean(),
:th=>Languages.Thai(),
:ka=>Languages.Georgian(),
:he=>Languages.Hebrew(),
:te=>Languages.Telugu(),
:et=>Languages.Estonian(),
:hi=>Languages.Hindi(),
:lt=>Languages.Lithuanian(),
:uk=>Languages.Ukrainian(),
:be=>Languages.Belarusian(),
:sw=>Languages.Swahili(),
:ur=>Languages.Urdu(),
:ku=>Languages.Kurdish(),
:az=>Languages.Azerbaijani(),
:ta=>Languages.Tamil()
# add more mappings here if needed
# AND supported by Languages.jl
)

# Regular expression on which to split text into tokens
const DEFAULT_SPLITTER = r"(,|:|\\|\/|;|\.|\[|\]|\{|\}|\"|\"|\s+)"
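
For reference, a minimal sketch of how this splitter behaves with Julia's `Base.split` (the sample sentence and the `keepempty` choice are illustrative assumptions, not taken from the package):
```julia
# Splitting a sentence on the delimiters above; consecutive delimiters
# produce empty pieces, which `keepempty=false` drops.
const DEFAULT_SPLITTER = r"(,|:|\\|\/|;|\.|\[|\]|\{|\}|\"|\"|\s+)"

tokens = split("embed this, document; please.", DEFAULT_SPLITTER, keepempty=false)
# 4-element Array{SubString{String},1}:
#  "embed"
#  "this"
#  "document"
#  "please"
```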

