-
Notifications
You must be signed in to change notification settings - Fork 29
/
Copy pathprepare-offsets.sh
executable file
·58 lines (42 loc) · 1.46 KB
/
prepare-offsets.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#!/bin/bash
# This script reads the given Wikipedia index to produce a file containing only its byte offsets (including the
# EOF byte offset), which can then be passed to process-wikidump.sh
#
# Params:
#
# ${1} Path to decompressed Wikipedia index file
# E.g., /path/to/enwiki-YYYYMMDD-pages-articles-multistream-index.txt
#
# ${2} Path to compressed Wikipedia data file (only its size in bytes is read,
# to supply the EOF byte offset)
# E.g., /path/to/enwiki-YYYYMMDD-pages-articles-multistream.xml.bz2
#
# ${3} (Optional) Path of file to be created, to receive the list of offsets
# E.g., /path/to/target-offsets.txt
# Default: ${1}.offsets
#
# Reports an unrecoverable error on stderr and terminates the script.
#
# Params:
#
# ${1} Message to report (printed with a "Fatal: " prefix)
#
# ${2} (Optional) Exit status to terminate with
# Default: 1
#
function fatal() {
  local -r exit_status="${2:-1}"
  printf 'Fatal: %s\n' "${1}" >&2
  exit "${exit_status}"
}
# Validates the script arguments and publishes the settings used by main() as
# read-only globals.
#
# Params:
#
# ${1} Path to the decompressed index file; must be an existing regular file
#
# ${2} Path to the compressed data file; must be an existing regular file
#
# ${3} (Optional) Path of the offsets file to be created
# Default: ${1}.offsets
#
# Globals written (read-only):
#
# INDEX_FILE      Value of ${1}
# DATA_FILE_SIZE  Size of ${2} in bytes
# OFFSETS_FILE    Value of ${3}, or its default
#
# Calls fatal() when an input file is missing, when the data file size cannot
# be determined, or when the offsets file's directory cannot be created.
#
function configure() {
  readonly INDEX_FILE="${1}"
  [[ -f "${INDEX_FILE}" ]] || fatal "Index file not found: ${INDEX_FILE}"
  [[ -f "${2}" ]] || fatal "Data file not found: ${2}"

  # Declare and assign separately so that a failing stat is not masked by the
  # exit status of 'local'/'readonly'. Try GNU stat first, then BSD stat.
  local data_file_size
  data_file_size="$(stat --printf="%s" -- "${2}" 2>/dev/null)" \
    || data_file_size="$(stat -f "%z" -- "${2}" 2>/dev/null)" \
    || fatal "Cannot determine size of data file: ${2}"
  # Guard against any unexpected (non-numeric) stat output.
  [[ "${data_file_size}" =~ ^[0-9]+$ ]] \
    || fatal "Cannot determine size of data file: ${2}"
  readonly DATA_FILE_SIZE="${data_file_size}"

  readonly OFFSETS_FILE="${3:-${INDEX_FILE}.offsets}"

  local offsets_file_dir
  offsets_file_dir="$(dirname -- "${OFFSETS_FILE}")" \
    || fatal "Cannot access ${OFFSETS_FILE}"
  if [[ ! -d "${offsets_file_dir}" ]]; then
    # -p: the target directory may be several missing levels deep.
    mkdir -p -- "${offsets_file_dir}" || fatal "Cannot access ${offsets_file_dir}"
  fi
  return 0
}
# Builds the offsets file from the index file.
#
# Takes the first ':'-separated field of every line of INDEX_FILE, collapses
# runs of adjacent identical values, writes the result to OFFSETS_FILE, and
# then appends DATA_FILE_SIZE as the final line.
#
# Globals read: INDEX_FILE, OFFSETS_FILE, DATA_FILE_SIZE
#
# Calls fatal() if the index cannot be read or the offsets file cannot be
# written.
#
function main() {
  local -a stage_status

  # Create the offsets file. Each pipeline stage is checked individually:
  # without this, only uniq's status would be seen and an unreadable index
  # would silently produce an (almost) empty offsets file.
  cut -d':' -f1 -- "${INDEX_FILE}" | uniq > "${OFFSETS_FILE}"
  stage_status=("${PIPESTATUS[@]}")
  (( stage_status[1] == 0 )) || fatal "Failed to write '${OFFSETS_FILE}'"
  (( stage_status[0] == 0 )) || fatal "Failed to read '${INDEX_FILE}'"

  # Append the ending byte offset (EOF) for the final stream...
  printf '%s\n' "${DATA_FILE_SIZE}" >> "${OFFSETS_FILE}" \
    || fatal "Failed to write '${OFFSETS_FILE}'"
  return 0
}
# Script entry point: resolve and validate the command-line arguments, then
# generate the offsets file. Failures detected by either step are reported
# through fatal(), which exits with a non-zero status; reaching the last line
# therefore means success.
configure "${@}"
main
exit 0