Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Check pofile string delimiters #1151

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 42 additions & 10 deletions babel/messages/pofile.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
from typing_extensions import Literal


_ESCAPED_CHARACTER_PATTERN = re.compile(r'\\([\\trn"])')

def unescape(string: str) -> str:
r"""Reverse `escape` the given string.

Expand All @@ -46,7 +48,7 @@ def replace_escapes(match):
return '\r'
# m is \ or "
return m
return re.compile(r'\\([\\trn"])').sub(replace_escapes, string[1:-1])
return _ESCAPED_CHARACTER_PATTERN.sub(replace_escapes, string[1:-1])


def denormalize(string: str) -> str:
Expand Down Expand Up @@ -142,7 +144,8 @@ def __init__(self, *args: str) -> None:
self.append(arg)

def append(self, s: str) -> None:
self._strs.append(s.strip())
assert s[0] == '"' and s[-1] == '"'
rtobar marked this conversation as resolved.
Show resolved Hide resolved
self._strs.append(s)

def denormalize(self) -> str:
return ''.join(map(unescape, self._strs))
Expand Down Expand Up @@ -249,7 +252,7 @@ def _finish_current_message(self) -> None:
if self.messages:
if not self.translations:
self._invalid_pofile("", self.offset, f"missing msgstr for msgid '{self.messages[0].denormalize()}'")
self.translations.append([0, _NormalizedString("")])
self.translations.append([0, _NormalizedString()])
self._add_message()

def _process_message_line(self, lineno, line, obsolete=False) -> None:
Expand All @@ -276,6 +279,18 @@ def _process_keyword_line(self, lineno, line, obsolete=False) -> None:

self.obsolete = obsolete

def _normalized_string(s):
# whole lines are already stripped on both ends
s = s.lstrip()
assert s, "string to normalize shouldn't be empty"
if s[0] != '"':
self._invalid_pofile(line, lineno, "String must be delimited on the left by double quotes")
s = '"' + s
if s[-1] != '"':
rtobar marked this conversation as resolved.
Show resolved Hide resolved
self._invalid_pofile(line, lineno, "String must be delimited on the right by double quotes")
s += '"'
return _NormalizedString(s)

# The line that has the msgid is stored as the offset of the msg
# should this be the msgctxt if it has one?
if keyword == 'msgid':
Expand All @@ -284,20 +299,33 @@ def _process_keyword_line(self, lineno, line, obsolete=False) -> None:
if keyword in ['msgid', 'msgid_plural']:
self.in_msgctxt = False
self.in_msgid = True
self.messages.append(_NormalizedString(arg))
self.messages.append(_normalized_string(arg))

elif keyword == 'msgstr':
self.in_msgid = False
self.in_msgstr = True
if arg.startswith('['):
idx, msg = arg[1:].split(']', 1)
self.translations.append([int(idx), _NormalizedString(msg)])
arg = arg[1:]
if ']' not in arg:
self._invalid_pofile(line, lineno, "msgstr plural doesn't include a closing bracket")
# if it doesn't include ] it probablyd doesn't have a number either
arg = '0]' + arg
idx, msg = arg.split(']', 1)
try:
idx = int(idx)
except ValueError:
self._invalid_pofile(line, lineno, "msgstr plural's index is not a number")
idx = 0
if len(msg) < 2:
self._invalid_pofile(line, lineno, "msgstr plural doesn't have a message")
msg = '""'
self.translations.append([idx, _normalized_string(msg)])
else:
self.translations.append([0, _NormalizedString(arg)])
self.translations.append([0, _normalized_string(arg)])

elif keyword == 'msgctxt':
self.in_msgctxt = True
self.context = _NormalizedString(arg)
self.context = _normalized_string(arg)

def _process_string_continuation_line(self, line, lineno) -> None:
if self.in_msgid:
Expand All @@ -309,6 +337,10 @@ def _process_string_continuation_line(self, line, lineno) -> None:
else:
self._invalid_pofile(line, lineno, "Got line starting with \" but not in msgid, msgstr or msgctxt")
return
assert line[0] == '"'
if line[-1] != '"':
self._invalid_pofile(line, lineno, "Continuation string must end with double quotes")
line += '"'
s.append(line)

def _process_comment(self, line) -> None:
Expand Down Expand Up @@ -366,8 +398,8 @@ def parse(self, fileobj: IO[AnyStr] | Iterable[AnyStr]) -> None:
# No actual messages found, but there was some info in comments, from which
# we'll construct an empty header message
if not self.counter and (self.flags or self.user_comments or self.auto_comments):
self.messages.append(_NormalizedString('""'))
self.translations.append([0, _NormalizedString('""')])
self.messages.append(_NormalizedString())
self.translations.append([0, _NormalizedString()])
self._add_message()

def _invalid_pofile(self, line, lineno, msg) -> None:
Expand Down
48 changes: 37 additions & 11 deletions tests/messages/test_normalized_string.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,43 @@
import pytest

from babel.messages.pofile import _NormalizedString


def test_normalized_string():
ab1 = _NormalizedString('a', 'b ')
ab2 = _NormalizedString('a', ' b')
ac1 = _NormalizedString('a', 'c')
ac2 = _NormalizedString(' a', 'c ')
ab = _NormalizedString('"a"', '"b"')
ac = _NormalizedString('"a"', '"c"')
z = _NormalizedString()
assert ab1 == ab2 and ac1 == ac2 # __eq__
assert ab1 < ac1 # __lt__
assert ac1 > ab2 # __gt__
assert ac1 >= ac2 # __ge__
assert ab1 <= ab2 # __le__
assert ab1 != ac1 # __ne__
assert ab < ac # __lt__
assert ac > ab # __gt__
assert ab != ac # __ne__
assert not z # __nonzero__ / __bool__
assert sorted([ab1, ab2, ac1]) # the sort order is not stable so we can't really check it, just that we can sort
assert sorted([ac, ab, z]) == [z, ab, ac] # sorted() is stable


@pytest.mark.parametrize(
"original, denormalized",
(
('""', ""),
('"a"', "a"),
('"a\\n"', "a\n"),
('"a\\r"', "a\r"),
('"a\\t"', "a\t"),
('"a\\""', 'a"'),
('"a\\\\"', "a\\"),
),
)
def test_denormalized_simple_normalized_string(original, denormalized):
assert denormalized == _NormalizedString(original).denormalize()

@pytest.mark.parametrize(
"originals, denormalized",
(
(('"a"', '"b"'), "ab"),
(('"a"', '"b"'), "ab"),
(('"ab"', '""'), "ab"),
(('"ab"', '"c"'), "abc"),
(('"a"', '"bc"'), "abc"),
),
)
def test_denormalized_multi_normalized_string(originals, denormalized):
assert denormalized == _NormalizedString(*originals).denormalize()
75 changes: 57 additions & 18 deletions tests/messages/test_pofile.py
Original file line number Diff line number Diff line change
Expand Up @@ -505,6 +505,7 @@ def test_with_location(self):


def test_abort_invalid_po_file(self):
# right double quote missing on msgstr, continuation missing both double quotes
invalid_po = '''
msgctxt ""
"{\"checksum\": 2148532640, \"cxt\": \"collector_thankyou\", \"id\": "
Expand All @@ -517,6 +518,7 @@ def test_abort_invalid_po_file(self):
Pour toute question, veuillez communiquer avec Fulano à nadie@blah.com
"
'''
# as above, but contains ascii-only characters
invalid_po_2 = '''
msgctxt ""
"{\"checksum\": 2148532640, \"cxt\": \"collector_thankyou\", \"id\": "
Expand All @@ -529,25 +531,62 @@ def test_abort_invalid_po_file(self):
Pour toute question, veuillez communiquer avec Fulano a fulano@blah.com
"
'''
# Catalog not created, throws Unicode Error
buf = StringIO(invalid_po)
output = pofile.read_po(buf, locale='fr', abort_invalid=False)
assert isinstance(output, Catalog)

# Catalog not created, throws PoFileError
buf = StringIO(invalid_po_2)
with pytest.raises(pofile.PoFileError):
pofile.read_po(buf, locale='fr', abort_invalid=True)

# Catalog is created with warning, no abort
buf = StringIO(invalid_po_2)
output = pofile.read_po(buf, locale='fr', abort_invalid=False)
assert isinstance(output, Catalog)
# left double quote missing in msgid
invalid_po_3 = '''
msgid *A"
msgstr "B"
'''
# right double quote missing in msgstr
invalid_po_4 = '''
msgid "A"
msgstr "B*
'''
# right double quote missing in msgstr continuation
invalid_po_5 = '''
msgid "A"
msgstr ""
"*
'''
for incorrectly_delimited_po_string in (invalid_po, invalid_po_2, invalid_po_3, invalid_po_4, invalid_po_5):
buf = StringIO(incorrectly_delimited_po_string)
with pytest.raises(pofile.PoFileError):
pofile.read_po(buf, locale='fr', abort_invalid=True)
buf.seek(0)
output = pofile.read_po(buf, locale='fr', abort_invalid=False)
assert isinstance(output, Catalog)

def test_invalid_msgstr_plural(self):
# msgstr plural broken
invalid_1 = '''
msgid "A"
msgstr[
'''
invalid_2 = '''
msgid "A"
msgstr[] "no index"
'''
invalid_3 = '''
msgid "A"
msgstr[bah] "index is not a number"
'''
# no content
invalid_4 = '''
msgid "A"
msgstr[0]
'''
# not correctly delimited
invalid_5 = '''
msgid "A"
msgstr[0] not delimited
'''
for incorrectly_plural in (invalid_1, invalid_2, invalid_3, invalid_4, invalid_5):
buf = StringIO(incorrectly_plural)
with pytest.raises(pofile.PoFileError):
pofile.read_po(buf, locale='fr', abort_invalid=True)
buf.seek(0)
output = pofile.read_po(buf, locale='fr', abort_invalid=False)
assert isinstance(output, Catalog)

# Catalog not created, aborted with PoFileError
buf = StringIO(invalid_po_2)
with pytest.raises(pofile.PoFileError):
pofile.read_po(buf, locale='fr', abort_invalid=True)

def test_invalid_pofile_with_abort_flag(self):
parser = pofile.PoFileParser(None, abort_invalid=True)
Expand Down