diff --git a/.gitignore b/.gitignore
index 2580acb..cf38443 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,4 +8,5 @@
 README.md.temp
 data/
 reports/
+resources/
 wikimedia-dumps/
diff --git a/build.clj b/build.clj
index 866a4a2..3da980e 100644
--- a/build.clj
+++ b/build.clj
@@ -48,3 +48,21 @@
         (doto (io/make-parents))
         (spit wd-props))))
 
+(def lexemes-dump-url
+  "URL of the latest gzipped Wikidata lexemes JSON dump."
+  "https://dumps.wikimedia.org/wikidatawiki/entities/latest-lexemes.json.gz")
+
+(def lexemes-dump
+  "Local destination of the dump; it lives under `resources/` so that it can
+  be loaded as a classpath resource (see `:paths` in deps.edn)."
+  (io/file "resources" "julesratte" "wikidata" "lexemes.json.gz"))
+
+(defn download-lexemes-dump
+  "Downloads the lexemes dump to `lexemes-dump` via curl, creating the parent
+  directories first. Arguments are ignored."
+  [& _]
+  (io/make-parents lexemes-dump)
+  ;; --fail: exit non-zero on HTTP errors instead of saving the error page as
+  ;; the dump; --location: follow redirects.
+  (check-proc! ["curl" "--fail" "--location"
+                "-o" (str lexemes-dump) lexemes-dump-url]))
diff --git a/deps.edn b/deps.edn
index e950e70..940284f 100644
--- a/deps.edn
+++ b/deps.edn
@@ -1,4 +1,5 @@
-{:deps
+{:paths ["src" "resources"]
+ :deps
  {org.clojure/clojure {:mvn/version "1.11.2"}
   org.clojure/data.zip {:mvn/version "1.1.0"}
   com.taoensso/timbre {:mvn/version "6.5.0"}
diff --git a/src/julesratte/wikidata/lexemes.clj b/src/julesratte/wikidata/lexemes.clj
new file mode 100644
index 0000000..3183d07
--- /dev/null
+++ b/src/julesratte/wikidata/lexemes.clj
@@ -0,0 +1,57 @@
+(ns julesratte.wikidata.lexemes
+  "Reads the gzipped Wikidata lexemes JSON dump from the classpath and logs
+  the number of entries it contains."
+  (:require
+   [clojure.java.io :as io]
+   [clojure.string :as str]
+   [julesratte.json :as json]
+   [taoensso.timbre :as log])
+  (:import
+   (java.util.zip GZIPInputStream)))
+
+(def parse-dump-xf
+  "Transducer turning the lines of the dump into parsed JSON values.
+
+  Lines are trimmed, lines of two characters or fewer are dropped (presumably
+  the brackets enclosing the JSON array -- TODO confirm against the dump
+  format), one trailing comma is removed, and the remaining lines are parsed
+  in parallel in batches of 32."
+  (comp
+   (map str/trim)
+   (filter #(< 2 (count %)))
+   (map #(str/replace % #",$" ""))
+   (partition-all 32)
+   (mapcat (partial pmap json/read-value))))
+
+(def dump-resource
+  "Classpath URL of the lexemes dump, resolved when this namespace is loaded;
+  nil if the dump was not on the classpath at that time."
+  (io/resource "julesratte/wikidata/lexemes.json.gz"))
+
+(defn -main
+  "Counts the entries in the lexemes dump and logs the total.
+
+  Throws `ex-info` if the dump is not on the classpath. Arguments are
+  ignored."
+  [& _]
+  (log/handle-uncaught-jvm-exceptions!)
+  (try
+    ;; `io/resource` yields nil for a missing resource; fail with an
+    ;; actionable message rather than the opaque error `io/input-stream`
+    ;; raises for nil.
+    (when-not dump-resource
+      (throw (ex-info (str "Lexemes dump not found on classpath; run the "
+                           "`download-lexemes-dump` build task first.")
+                      {:resource "julesratte/wikidata/lexemes.json.gz"})))
+    (with-open [stream (io/input-stream dump-resource)
+                gz-stream (GZIPInputStream. stream)
+                reader (io/reader gz-stream)]
+      ;; Fold straight into a count instead of building an intermediate seq.
+      (let [n (transduce parse-dump-xf
+                         (completing (fn [acc _] (inc acc)))
+                         0
+                         (line-seq reader))]
+        (log/infof "Lexemes: %,d" n)))
+    (finally
+      ;; `pmap` runs on the agent thread pool; shut it down so the JVM exits.
+      (shutdown-agents))))