Skip to content

Commit

Permalink
Adds parsing of Wikidata lexeme dumps
Browse files Browse the repository at this point in the history
  • Loading branch information
gremid committed Mar 14, 2024
1 parent eec1dc3 commit 7f94e51
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 1 deletion.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ README.md.temp

data/
reports/
resources/
wikimedia-dumps/
10 changes: 10 additions & 0 deletions build.clj
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,13 @@
(doto (io/make-parents))
(spit wd-props))))

;; Remote location of the gzip-compressed JSON dump of Wikidata lexemes
;; ("latest" snapshot on dumps.wikimedia.org). Fetched by
;; `download-lexemes-dump`.
(def lexemes-dump-url
"https://dumps.wikimedia.org/wikidatawiki/entities/latest-lexemes.json.gz")

;; Local file that `download-lexemes-dump` writes the dump to. The path is
;; relative (rooted at "resources"), so it resolves against the working
;; directory of the process.
(def lexemes-dump
(io/file "resources" "julesratte" "wikidata" "lexemes.json.gz"))

(defn download-lexemes-dump
  "Downloads the Wikidata lexemes dump from `lexemes-dump-url` into the
  local file `lexemes-dump`, creating missing parent directories first.

  All arguments are ignored. The download is delegated to the `curl`
  executable, invoked through `check-proc!`."
  [& _]
  (io/make-parents lexemes-dump)
  ;; --fail:     make curl exit non-zero on HTTP errors (>= 400) instead of
  ;;             saving the server's error page as the \".json.gz\" file and
  ;;             reporting success.
  ;; --location: follow redirects, should the dump be served via one.
  ;; NOTE(review): relies on `check-proc!` treating a non-zero exit status
  ;; as a failure -- confirm against its definition.
  (check-proc! ["curl" "--fail" "--location"
                "-o" (str lexemes-dump) lexemes-dump-url]))
3 changes: 2 additions & 1 deletion deps.edn
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
{:deps
{:paths ["src" "resources"]
:deps
{org.clojure/clojure {:mvn/version "1.11.2"}
org.clojure/data.zip {:mvn/version "1.1.0"}
com.taoensso/timbre {:mvn/version "6.5.0"}
Expand Down
31 changes: 31 additions & 0 deletions src/julesratte/wikidata/lexemes.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
(ns julesratte.wikidata.lexemes
(:require
[clojure.java.io :as io]
[clojure.string :as str]
[julesratte.json :as json]
[taoensso.timbre :as log])
(:import
(java.util.zip GZIPInputStream)))

(def parse-dump-xf
  "Transducer that turns the text lines of a dump into parsed JSON values.

  Each line is trimmed; lines of two characters or fewer are dropped; a
  single trailing comma is removed; the remaining lines are grouped into
  batches of up to 32 and every batch is parsed with `json/read-value`
  through `pmap`."
  (let [long-enough?         (fn [line] (< 2 (count line)))
        strip-trailing-comma (fn [line] (str/replace line #",$" ""))
        parse-batch          (fn [batch] (pmap json/read-value batch))]
    (comp
     (map str/trim)
     (filter long-enough?)
     (map strip-trailing-comma)
     (partition-all 32)
     (mapcat parse-batch))))

;; Classpath resource for the downloaded lexemes dump, looked up once when
;; this namespace is loaded.
;; NOTE(review): `io/resource` yields nil when the resource is not on the
;; classpath (e.g. the dump has not been downloaded yet) -- callers should
;; expect that.
(def dump-resource
(io/resource "julesratte/wikidata/lexemes.json.gz"))

(defn -main
  "Entry point: reads the gzipped lexemes dump from `dump-resource`, parses
  it line by line with `parse-dump-xf` and logs the number of parsed values.

  Command-line arguments are ignored. Throws an `ex-info` when the dump
  resource is not available on the classpath."
  [& _]
  (log/handle-uncaught-jvm-exceptions!)
  (try
    ;; `dump-resource` is nil when the dump is missing from the classpath;
    ;; fail with an actionable message instead of the opaque error that
    ;; `io/input-stream` raises for nil. The check sits inside `try` so the
    ;; agent pools are still shut down on this path.
    (when-not dump-resource
      (throw (ex-info (str "Lexemes dump not found on classpath: "
                           "julesratte/wikidata/lexemes.json.gz "
                           "(has the dump been downloaded?)")
                      {:resource "julesratte/wikidata/lexemes.json.gz"})))
    (with-open [raw-stream  (io/input-stream dump-resource)
                gzip-stream (GZIPInputStream. raw-stream)
                reader      (io/reader gzip-stream)]
      ;; `count` forces the whole sequence while the reader is still open.
      (let [n (count (sequence parse-dump-xf (line-seq reader)))]
        (log/infof "Lexemes: %,d" n)))
    (finally
      ;; `parse-dump-xf` uses `pmap`; shut the agent thread pools down so
      ;; their threads do not delay JVM exit.
      (shutdown-agents))))

0 comments on commit 7f94e51

Please sign in to comment.