Skip to content

Commit

Permalink
truncate the XML to a <page> with a <title>
Browse files Browse the repository at this point in the history
  • Loading branch information
yzqzss committed Jan 3, 2023
1 parent 6a99870 commit fa5e3cc
Showing 1 changed file with 29 additions and 0 deletions.
29 changes: 29 additions & 0 deletions wikiteam3/dumpgenerator/xml_truncate.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,33 @@
from file_read_backwards import FileReadBackwards


def endsWithNewlines(filename: str) -> int:
    """Return the number of newlines at the very end of the file.

    The file is scanned backwards in binary mode, one byte at a time, until
    the first byte that is not part of a line terminator. Each ``\\n`` counts
    as one newline; a ``\\r`` is skipped so that a Windows ``\\r\\n`` pair is
    counted once.

    Args:
        filename: Path of the file to inspect.

    Returns:
        The count of trailing newlines; ``0`` for an empty file or a file
        whose last byte is not a newline.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    import os

    newlines = 0
    with open(filename, "rb") as f:
        # Scanning raw bytes is safe for UTF-8: 0x0A and 0x0D never occur
        # inside a multi-byte sequence.
        position = f.seek(0, os.SEEK_END)
        while position > 0:
            position -= 1
            f.seek(position)
            byte = f.read(1)
            if byte == b"\n":
                newlines += 1
            elif byte != b"\r":
                # First byte of real content: the trailing run has ended.
                break
    return newlines


def addNewline(filename: str) -> None:
    """Append a single newline character to the end of ``filename``.

    A notice naming the file is printed to standard output first. The file
    is opened in UTF-8 text append mode, so existing content is left as is.
    """
    notice = f"Adding newline to end of {filename}"
    print(notice)

    with open(filename, mode="a", encoding="utf-8") as dump_file:
        dump_file.write("\n")


def truncateXMLDump(filename: str) -> None:
"""Removes incomplete <page> elements from the end of XML dump files"""

with FileReadBackwards(filename, encoding="utf-8") as frb:
incomplete_segment: str = ""
xml_line: str = frb.readline()
while xml_line and "</title>" not in xml_line:
incomplete_segment += xml_line
xml_line = frb.readline()
while xml_line and "</page>" not in xml_line:
incomplete_segment += xml_line
xml_line = frb.readline()
Expand All @@ -27,3 +48,11 @@ def truncateXMLDump(filename: str) -> None:
+ str(file_size - incomplete_segment_size)
+ "), which would be illegal. Something is seriously wrong here!"
)

# add newline to prevent `</page> <page>` in one line
if endsWithNewlines(filename) == 0:
addNewline(filename)
elif endsWithNewlines(filename) > 1:
print(
f"WARNING: {filename} has {endsWithNewlines(filename)} newlines"
)

0 comments on commit fa5e3cc

Please sign in to comment.