diff --git a/babel/messages/pofile.py b/babel/messages/pofile.py index 721707657..4190d4104 100644 --- a/babel/messages/pofile.py +++ b/babel/messages/pofile.py @@ -26,6 +26,8 @@ from typing_extensions import Literal +_ESCAPED_CHARACTER_PATTERN = re.compile(r'\\([\\trn"])') + def unescape(string: str) -> str: r"""Reverse `escape` the given string. @@ -46,7 +48,7 @@ def replace_escapes(match): return '\r' # m is \ or " return m - return re.compile(r'\\([\\trn"])').sub(replace_escapes, string[1:-1]) + return _ESCAPED_CHARACTER_PATTERN.sub(replace_escapes, string[1:-1]) def denormalize(string: str) -> str: @@ -142,7 +144,8 @@ def __init__(self, *args: str) -> None: self.append(arg) def append(self, s: str) -> None: - self._strs.append(s.strip()) + assert s[0] == '"' and s[-1] == '"' + self._strs.append(s) def denormalize(self) -> str: return ''.join(map(unescape, self._strs)) @@ -249,7 +252,7 @@ def _finish_current_message(self) -> None: if self.messages: if not self.translations: self._invalid_pofile("", self.offset, f"missing msgstr for msgid '{self.messages[0].denormalize()}'") - self.translations.append([0, _NormalizedString("")]) + self.translations.append([0, _NormalizedString()]) self._add_message() def _process_message_line(self, lineno, line, obsolete=False) -> None: @@ -276,6 +279,18 @@ def _process_keyword_line(self, lineno, line, obsolete=False) -> None: self.obsolete = obsolete + def _normalized_string(s): + # whole lines are already stripped on both ends + s = s.lstrip() + assert s, "string to normalize shouldn't be empty" + if s[0] != '"': + self._invalid_pofile(line, lineno, "String must be delimited on the left by double quotes") + s = '"' + s + if s[-1] != '"': + self._invalid_pofile(line, lineno, "String must be delimited on the right by double quotes") + s += '"' + return _NormalizedString(s) + # The line that has the msgid is stored as the offset of the msg # should this be the msgctxt if it has one? if keyword == 'msgid': @@ -284,20 +299,33 @@ def _process_keyword_line(self, lineno, line, obsolete=False) -> None: if keyword in ['msgid', 'msgid_plural']: self.in_msgctxt = False self.in_msgid = True - self.messages.append(_NormalizedString(arg)) + self.messages.append(_normalized_string(arg)) elif keyword == 'msgstr': self.in_msgid = False self.in_msgstr = True if arg.startswith('['): - idx, msg = arg[1:].split(']', 1) - self.translations.append([int(idx), _NormalizedString(msg)]) + arg = arg[1:] + if ']' not in arg: + self._invalid_pofile(line, lineno, "msgstr plural doesn't include a closing bracket") + # if it doesn't include ] it probablyd doesn't have a number either + arg = '0]' + arg + idx, msg = arg.split(']', 1) + try: + idx = int(idx) + except ValueError: + self._invalid_pofile(line, lineno, "msgstr plural's index is not a number") + idx = 0 + if len(msg) < 2: + self._invalid_pofile(line, lineno, "msgstr plural doesn't have a message") + msg = '""' + self.translations.append([idx, _normalized_string(msg)]) else: - self.translations.append([0, _NormalizedString(arg)]) + self.translations.append([0, _normalized_string(arg)]) elif keyword == 'msgctxt': self.in_msgctxt = True - self.context = _NormalizedString(arg) + self.context = _normalized_string(arg) def _process_string_continuation_line(self, line, lineno) -> None: if self.in_msgid: @@ -309,6 +337,10 @@ def _process_string_continuation_line(self, line, lineno) -> None: else: self._invalid_pofile(line, lineno, "Got line starting with \" but not in msgid, msgstr or msgctxt") return + assert line[0] == '"' + if line[-1] != '"': + self._invalid_pofile(line, lineno, "Continuation string must end with double quotes") + line += '"' s.append(line) def _process_comment(self, line) -> None: @@ -366,8 +398,8 @@ def parse(self, fileobj: IO[AnyStr] | Iterable[AnyStr]) -> None: # No actual messages found, but there was some info in comments, from which # we'll construct an empty header message if not self.counter and (self.flags or self.user_comments or self.auto_comments): - self.messages.append(_NormalizedString('""')) - self.translations.append([0, _NormalizedString('""')]) + self.messages.append(_NormalizedString()) + self.translations.append([0, _NormalizedString()]) self._add_message() def _invalid_pofile(self, line, lineno, msg) -> None: diff --git a/tests/messages/test_normalized_string.py b/tests/messages/test_normalized_string.py index 9c95672b4..f398a1680 100644 --- a/tests/messages/test_normalized_string.py +++ b/tests/messages/test_normalized_string.py @@ -1,17 +1,43 @@ +import pytest + from babel.messages.pofile import _NormalizedString def test_normalized_string(): - ab1 = _NormalizedString('a', 'b ') - ab2 = _NormalizedString('a', ' b') - ac1 = _NormalizedString('a', 'c') - ac2 = _NormalizedString(' a', 'c ') + ab = _NormalizedString('"a"', '"b"') + ac = _NormalizedString('"a"', '"c"') z = _NormalizedString() - assert ab1 == ab2 and ac1 == ac2 # __eq__ - assert ab1 < ac1 # __lt__ - assert ac1 > ab2 # __gt__ - assert ac1 >= ac2 # __ge__ - assert ab1 <= ab2 # __le__ - assert ab1 != ac1 # __ne__ + assert ab < ac # __lt__ + assert ac > ab # __gt__ + assert ab != ac # __ne__ assert not z # __nonzero__ / __bool__ - assert sorted([ab1, ab2, ac1]) # the sort order is not stable so we can't really check it, just that we can sort + assert sorted([ac, ab, z]) == [z, ab, ac] # sorted() is stable + + +@pytest.mark.parametrize( + "original, denormalized", + ( + ('""', ""), + ('"a"', "a"), + ('"a\\n"', "a\n"), + ('"a\\r"', "a\r"), + ('"a\\t"', "a\t"), + ('"a\\""', 'a"'), + ('"a\\\\"', "a\\"), + ), +) +def test_denormalized_simple_normalized_string(original, denormalized): + assert denormalized == _NormalizedString(original).denormalize() + +@pytest.mark.parametrize( + "originals, denormalized", + ( + (('"a"', '"b"'), "ab"), + (('"a"', '"b"'), "ab"), + (('"ab"', '""'), "ab"), + (('"ab"', '"c"'), "abc"), + (('"a"', '"bc"'), "abc"), + ), +) +def test_denormalized_multi_normalized_string(originals, denormalized): + assert denormalized == _NormalizedString(*originals).denormalize() diff --git a/tests/messages/test_pofile.py b/tests/messages/test_pofile.py index ed9da78f6..b875eb917 100644 --- a/tests/messages/test_pofile.py +++ b/tests/messages/test_pofile.py @@ -505,6 +505,7 @@ def test_with_location(self): def test_abort_invalid_po_file(self): + # right double quote missing on msgstr, continuation missing both double quotes invalid_po = ''' msgctxt "" "{\"checksum\": 2148532640, \"cxt\": \"collector_thankyou\", \"id\": " @@ -517,6 +518,7 @@ def test_abort_invalid_po_file(self): Pour toute question, veuillez communiquer avec Fulano à nadie@blah.com " ''' + # as above, but contains ascii-only characters invalid_po_2 = ''' msgctxt "" "{\"checksum\": 2148532640, \"cxt\": \"collector_thankyou\", \"id\": " @@ -529,25 +531,62 @@ def test_abort_invalid_po_file(self): Pour toute question, veuillez communiquer avec Fulano a fulano@blah.com " ''' - # Catalog not created, throws Unicode Error - buf = StringIO(invalid_po) - output = pofile.read_po(buf, locale='fr', abort_invalid=False) - assert isinstance(output, Catalog) - - # Catalog not created, throws PoFileError - buf = StringIO(invalid_po_2) - with pytest.raises(pofile.PoFileError): - pofile.read_po(buf, locale='fr', abort_invalid=True) - - # Catalog is created with warning, no abort - buf = StringIO(invalid_po_2) - output = pofile.read_po(buf, locale='fr', abort_invalid=False) - assert isinstance(output, Catalog) + # left double quote missing in msgid + invalid_po_3 = ''' + msgid *A" + msgstr "B" + ''' + # right double quote missing in msgstr + invalid_po_4 = ''' + msgid "A" + msgstr "B* + ''' + # right double quote missing in msgstr continuation + invalid_po_5 = ''' + msgid "A" + msgstr "" + "* + ''' + for incorrectly_delimited_po_string in (invalid_po, invalid_po_2, invalid_po_3, invalid_po_4, invalid_po_5): + buf = StringIO(incorrectly_delimited_po_string) + with pytest.raises(pofile.PoFileError): + pofile.read_po(buf, locale='fr', abort_invalid=True) + buf.seek(0) + output = pofile.read_po(buf, locale='fr', abort_invalid=False) + assert isinstance(output, Catalog) + + def test_invalid_msgstr_plural(self): + # msgstr plural broken + invalid_1 = ''' + msgid "A" + msgstr[ + ''' + invalid_2 = ''' + msgid "A" + msgstr[] "no index" + ''' + invalid_3 = ''' + msgid "A" + msgstr[bah] "index is not a number" + ''' + # no content + invalid_4 = ''' + msgid "A" + msgstr[0] + ''' + # not correctly delimited + invalid_5 = ''' + msgid "A" + msgstr[0] not delimited + ''' + for incorrectly_plural in (invalid_1, invalid_2, invalid_3, invalid_4, invalid_5): + buf = StringIO(incorrectly_plural) + with pytest.raises(pofile.PoFileError): + pofile.read_po(buf, locale='fr', abort_invalid=True) + buf.seek(0) + output = pofile.read_po(buf, locale='fr', abort_invalid=False) + assert isinstance(output, Catalog) - # Catalog not created, aborted with PoFileError - buf = StringIO(invalid_po_2) - with pytest.raises(pofile.PoFileError): - pofile.read_po(buf, locale='fr', abort_invalid=True) def test_invalid_pofile_with_abort_flag(self): parser = pofile.PoFileParser(None, abort_invalid=True)