-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract
executable file
·42 lines (30 loc) · 1.44 KB
/
extract
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#!/bin/bash
set -ex
# Overall script: download the zhwiki categorylinks and page dumps, load them
# into SQLite, and run query.sql against them. The final output is in
# extracted.txt

#######################################
# Download and unpack one zhwiki SQL dump, unless the unpacked file is
# already present in the current directory.
# Arguments: $1 - name of the unpacked dump (e.g. zhwiki-latest-page.sql)
# Outputs:   writes $1.gz and $1 to the current directory
#######################################
fetch_dump() {
  local sql_file=$1
  if [ -f "$sql_file" ]; then
    return 0
  fi
  # -f: fail on an HTTP error instead of saving the error page as the .gz.
  curl -f -O "https://dumps.wikimedia.org/zhwiki/latest/${sql_file}.gz"
  # Unpack under a temporary name and rename only on success, so an
  # interrupted run can never leave a truncated file that the existence
  # check above would mistake for a complete dump. The .gz is kept.
  gunzip -c "${sql_file}.gz" > "${sql_file}.partial"
  mv -f "${sql_file}.partial" "$sql_file"
}

fetch_dump zhwiki-latest-categorylinks.sql
fetch_dump zhwiki-latest-page.sql
# rm -vf zhwiki.sqlite3 # clear derived file
# Build the SQLite database from the two dumps, unless it already exists.
if [ ! -f zhwiki.sqlite3 ]; then
  # Build under a temporary name and rename at the very end: if any step
  # below fails, no zhwiki.sqlite3 is left behind, so the next run rebuilds
  # instead of silently querying a half-populated database.
  db_tmp=zhwiki.sqlite3.partial
  rm -f "$db_tmp"
  # hack-mangle.py presumably writes <input>.fixed, which is what gets
  # imported next -- confirm against that script.
  ./hack-mangle.py zhwiki-latest-categorylinks.sql categorylinks
  sqlite3 "$db_tmp" < zhwiki-latest-categorylinks.sql.fixed
  ./hack-mangle.py zhwiki-latest-page.sql page
  sqlite3 "$db_tmp" < zhwiki-latest-page.sql.fixed
  # Index both columns of categorylinks.
  sqlite3 "$db_tmp" 'CREATE INDEX "category_from" ON categorylinks (cl_from);'
  sqlite3 "$db_tmp" 'CREATE INDEX "category_to" ON categorylinks (cl_to);'
  mv -f "$db_tmp" zhwiki.sqlite3
fi
set +x
# Every query below is piped into tee. Without pipefail the pipeline reports
# tee's status, so a failing sqlite3 would go unnoticed and extracted.txt
# would silently be incomplete.
set -o pipefail
# Run query.sql once per year 1900..2022 with the YYYY placeholder replaced,
# then twice more with the whole "YYYY年出生" category name replaced by a
# fixed category. All results go to stdout and to temp_names.txt.
{
  for ((YYYY = 1900; YYYY <= 2022; YYYY++)); do
    echo "$YYYY" >&2 # progress indicator on stderr, kept out of the results
    sed "s/YYYY/${YYYY}/g" query.sql | sqlite3 zhwiki.sqlite3
  done
  # Category names are intentionally written in simplified Chinese:
  # 生年不详 = "birth year unknown", 在世人物 = "living people".
  sed 's/YYYY年出生/生年不详/' query.sql | sqlite3 zhwiki.sqlite3
  sed 's/YYYY年出生/在世人物/' query.sql | sqlite3 zhwiki.sqlite3
} | tee temp_names.txt
# On macOS the sort tries to be smart (locale-aware collation), so force
# byte-wise comparison. Deduplicate inside the same sort call so the
# uniqueness check uses the C locale too, rather than a separate uniq running
# in the ambient locale.
LC_ALL=C sort -u temp_names.txt > extracted.txt