From 913feaac9c0db351536ce6c4317ede1230856383 Mon Sep 17 00:00:00 2001 From: Rodrigo Tobar Date: Sun, 17 Nov 2024 23:10:53 +0800 Subject: [PATCH 1/7] Avoid re-compiling regular expression While the re module caches some of the latest compilations, it's better form to not rely on it doing so. Signed-off-by: Rodrigo Tobar --- babel/messages/pofile.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/babel/messages/pofile.py b/babel/messages/pofile.py index 721707657..725fd7303 100644 --- a/babel/messages/pofile.py +++ b/babel/messages/pofile.py @@ -26,6 +26,8 @@ from typing_extensions import Literal +_ESCAPED_CHARACTER_PATTERN = re.compile(r'\\([\\trn"])') + def unescape(string: str) -> str: r"""Reverse `escape` the given string. @@ -46,7 +48,7 @@ def replace_escapes(match): return '\r' # m is \ or " return m - return re.compile(r'\\([\\trn"])').sub(replace_escapes, string[1:-1]) + return _ESCAPED_CHARACTER_PATTERN.sub(replace_escapes, string[1:-1]) def denormalize(string: str) -> str: From 3647ae867f35049905f0c8efa741545e7ffffd2b Mon Sep 17 00:00:00 2001 From: Rodrigo Tobar Date: Sun, 17 Nov 2024 11:08:55 +0800 Subject: [PATCH 2/7] Remove duplicate test assertion The exact same check is performed a few lines above. Signed-off-by: Rodrigo Tobar --- tests/messages/test_pofile.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/messages/test_pofile.py b/tests/messages/test_pofile.py index ed9da78f6..f0431f811 100644 --- a/tests/messages/test_pofile.py +++ b/tests/messages/test_pofile.py @@ -544,11 +544,6 @@ def test_abort_invalid_po_file(self): output = pofile.read_po(buf, locale='fr', abort_invalid=False) assert isinstance(output, Catalog) - # Catalog not created, aborted with PoFileError - buf = StringIO(invalid_po_2) - with pytest.raises(pofile.PoFileError): - pofile.read_po(buf, locale='fr', abort_invalid=True) - def test_invalid_pofile_with_abort_flag(self): parser = pofile.PoFileParser(None, abort_invalid=True) lineno = 10 From 5e84dc612bcff29ffd1f70ffc8803726bd3c6824 Mon Sep 17 00:00:00 2001 From: Rodrigo Tobar Date: Sun, 17 Nov 2024 22:15:00 +0800 Subject: [PATCH 3/7] Perform full intended assertion in test Since Python 2.3 sorted() has been guaranteed to be stable. The comment was wrong, and thus it makes sense to do the full assertion as clearly intended. Signed-off-by: Rodrigo Tobar --- tests/messages/test_normalized_string.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/messages/test_normalized_string.py b/tests/messages/test_normalized_string.py index 9c95672b4..45ff2ad98 100644 --- a/tests/messages/test_normalized_string.py +++ b/tests/messages/test_normalized_string.py @@ -14,4 +14,4 @@ def test_normalized_string(): assert ab1 <= ab2 # __le__ assert ab1 != ac1 # __ne__ assert not z # __nonzero__ / __bool__ - assert sorted([ab1, ab2, ac1]) # the sort order is not stable so we can't really check it, just that we can sort + assert sorted([ab1, ab2, ac1]) == [ab1, ab2, ac1] # sorted() is stable From 663f97e68879a4742fab501c82cfdbc64e61ab4a Mon Sep 17 00:00:00 2001 From: Rodrigo Tobar Date: Sun, 17 Nov 2024 22:32:56 +0800 Subject: [PATCH 4/7] Clarify usage of of _NormalizeString The _NormalizeString helper class mixes some responsibilities, not only acting as a container for potentially multiple lines of a single string message, but also doing and hiding some of the parsing of such strings. "Doing" because it performs a .strip() on all incoming strings in order to remove any whitespace before/after them, and "hiding" because when invoking the "denormalize" method, each line is slices to remove the first and last element, which are implicitly assumed to be the string delimiters (double quotes, in principle). These multiple roles have already led to confusion within the codebase as to how this class is supposed to be used. Its existing unit test doesn't provide strings with proper delimiters (and thus calling .denormalize() on these objects would return unexpected results -- empty strings in all cases). Similarly, missing msgstr instances also result in a call to _NormalizeString(""), which does work, but is conceptually incorrect, as the empty string is somethiing that _NormalizeString should never see coming in. This commit changes all the places where confusing usage of the _NormalizeString class happens. In particular, the existing unit test's strings are now always delimited by double quotes (so calling .denormalize on them would yield the expected value). A number of new unit tests have also been added exercising the denormalize() method, which includes unescaping escaped characters. Finally, the construction of an empty string message has been simplified to _NormalizeString(). Signed-off-by: Rodrigo Tobar --- babel/messages/pofile.py | 6 ++-- tests/messages/test_normalized_string.py | 39 +++++++++++++++++++++--- 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/babel/messages/pofile.py b/babel/messages/pofile.py index 725fd7303..97a3722d8 100644 --- a/babel/messages/pofile.py +++ b/babel/messages/pofile.py @@ -251,7 +251,7 @@ def _finish_current_message(self) -> None: if self.messages: if not self.translations: self._invalid_pofile("", self.offset, f"missing msgstr for msgid '{self.messages[0].denormalize()}'") - self.translations.append([0, _NormalizedString("")]) + self.translations.append([0, _NormalizedString()]) self._add_message() def _process_message_line(self, lineno, line, obsolete=False) -> None: @@ -368,8 +368,8 @@ def parse(self, fileobj: IO[AnyStr] | Iterable[AnyStr]) -> None: # No actual messages found, but there was some info in comments, from which # we'll construct an empty header message if not self.counter and (self.flags or self.user_comments or self.auto_comments): - self.messages.append(_NormalizedString('""')) - self.translations.append([0, _NormalizedString('""')]) + self.messages.append(_NormalizedString()) + self.translations.append([0, _NormalizedString()]) self._add_message() def _invalid_pofile(self, line, lineno, msg) -> None: diff --git a/tests/messages/test_normalized_string.py b/tests/messages/test_normalized_string.py index 45ff2ad98..735603770 100644 --- a/tests/messages/test_normalized_string.py +++ b/tests/messages/test_normalized_string.py @@ -1,11 +1,13 @@ +import pytest + from babel.messages.pofile import _NormalizedString def test_normalized_string(): - ab1 = _NormalizedString('a', 'b ') - ab2 = _NormalizedString('a', ' b') - ac1 = _NormalizedString('a', 'c') - ac2 = _NormalizedString(' a', 'c ') + ab1 = _NormalizedString('"a"', '"b" ') + ab2 = _NormalizedString('"a"', ' "b"') + ac1 = _NormalizedString('"a"', '"c"') + ac2 = _NormalizedString(' "a"', '"c" ') z = _NormalizedString() assert ab1 == ab2 and ac1 == ac2 # __eq__ assert ab1 < ac1 # __lt__ @@ -15,3 +17,32 @@ def test_normalized_string(): assert ab1 != ac1 # __ne__ assert not z # __nonzero__ / __bool__ assert sorted([ab1, ab2, ac1]) == [ab1, ab2, ac1] # sorted() is stable + + +@pytest.mark.parametrize( + "original, denormalized", + ( + ('""', ""), + ('"a"', "a"), + ('"a\\n"', "a\n"), + ('"a\\r"', "a\r"), + ('"a\\t"', "a\t"), + ('"a\\""', 'a"'), + ('"a\\\\"', "a\\"), + ), +) +def test_denormalized_simple_normalized_string(original, denormalized): + assert denormalized == _NormalizedString(original).denormalize() + +@pytest.mark.parametrize( + "originals, denormalized", + ( + (('"a"', '"b"'), "ab"), + (('"a" ', '"b"'), "ab"), + (('"ab"', '""'), "ab"), + (('"ab"', '"c"'), "abc"), + (('"a"', ' "bc"'), "abc"), + ), +) +def test_denormalized_multi_normalized_string(originals, denormalized): + assert denormalized == _NormalizedString(*originals).denormalize() From 5f09192809ac70706fce876f1ba62c6094811cf8 Mon Sep 17 00:00:00 2001 From: Rodrigo Tobar Date: Tue, 17 Dec 2024 16:28:37 +0800 Subject: [PATCH 5/7] Add further msgstr plural checks --- babel/messages/pofile.py | 17 +++++++++++++++-- tests/messages/test_pofile.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/babel/messages/pofile.py b/babel/messages/pofile.py index 97a3722d8..d927ecf1c 100644 --- a/babel/messages/pofile.py +++ b/babel/messages/pofile.py @@ -292,8 +292,21 @@ def _process_keyword_line(self, lineno, line, obsolete=False) -> None: self.in_msgid = False self.in_msgstr = True if arg.startswith('['): - idx, msg = arg[1:].split(']', 1) - self.translations.append([int(idx), _NormalizedString(msg)]) + arg = arg[1:] + if ']' not in arg: + self._invalid_pofile(line, lineno, "msgstr plural doesn't include a closing bracket") + # if it doesn't include ] it probablyd doesn't have a number either + arg = '0]' + arg + idx, msg = arg.split(']', 1) + try: + idx = int(idx) + except ValueError: + self._invalid_pofile(line, lineno, "msgstr plural's index is not a number") + idx = 0 + if len(msg) < 2: + self._invalid_pofile(line, lineno, "msgstr plural doesn't have a message") + msg = '""' + self.translations.append([idx, _NormalizedString(msg)]) else: self.translations.append([0, _NormalizedString(arg)]) diff --git a/tests/messages/test_pofile.py b/tests/messages/test_pofile.py index f0431f811..1c083fe76 100644 --- a/tests/messages/test_pofile.py +++ b/tests/messages/test_pofile.py @@ -544,6 +544,34 @@ def test_abort_invalid_po_file(self): output = pofile.read_po(buf, locale='fr', abort_invalid=False) assert isinstance(output, Catalog) + def test_invalid_msgstr_plural(self): + # msgstr plural broken + invalid_1 = ''' + msgid "A" + msgstr[ + ''' + invalid_2 = ''' + msgid "A" + msgstr[] "no index" + ''' + invalid_3 = ''' + msgid "A" + msgstr[bah] "index is not a number" + ''' + # no content + invalid_4 = ''' + msgid "A" + msgstr[0] + ''' + for incorrectly_plural in (invalid_1, invalid_2, invalid_3, invalid_4): + buf = StringIO(incorrectly_plural) + with pytest.raises(pofile.PoFileError): + pofile.read_po(buf, locale='fr', abort_invalid=True) + buf.seek(0) + output = pofile.read_po(buf, locale='fr', abort_invalid=False) + assert isinstance(output, Catalog) + + def test_invalid_pofile_with_abort_flag(self): parser = pofile.PoFileParser(None, abort_invalid=True) lineno = 10 From 00062fbe4b8619a6aaa29b16825a6d974f03a3ea Mon Sep 17 00:00:00 2001 From: Rodrigo Tobar Date: Sun, 17 Nov 2024 11:06:03 +0800 Subject: [PATCH 6/7] Detect incorrectly delimited strings Strings should be delimited on both ends by double quotes, but this is currently not being been detected, and content is simply being incorrectly trimmed. This commit adds a check for each string to verify it starts and ends with a double quote character, issuing a warning/error if that's not the case (and fixing it as appropriate). A few new test cases have been added to check that the lack of double quotes to delimit strings issues errors as expected. Signed-off-by: Rodrigo Tobar --- babel/messages/pofile.py | 24 +++++++++++++++--- tests/messages/test_pofile.py | 46 +++++++++++++++++++++++------------ 2 files changed, 51 insertions(+), 19 deletions(-) diff --git a/babel/messages/pofile.py b/babel/messages/pofile.py index d927ecf1c..3d8eb8b05 100644 --- a/babel/messages/pofile.py +++ b/babel/messages/pofile.py @@ -278,6 +278,18 @@ def _process_keyword_line(self, lineno, line, obsolete=False) -> None: self.obsolete = obsolete + def _normalized_string(s): + # whole lines are already stripped on both ends + s = s.lstrip() + assert s, "string to normalize shouldn't be empty" + if s[0] != '"': + self._invalid_pofile(line, lineno, "String must be delimited on the left by double quotes") + s = '"' + s + if s[-1] != '"': + self._invalid_pofile(line, lineno, "String must be delimited on the right by double quotes") + s += '"' + return _NormalizedString(s) + # The line that has the msgid is stored as the offset of the msg # should this be the msgctxt if it has one? if keyword == 'msgid': @@ -286,7 +298,7 @@ def _process_keyword_line(self, lineno, line, obsolete=False) -> None: if keyword in ['msgid', 'msgid_plural']: self.in_msgctxt = False self.in_msgid = True - self.messages.append(_NormalizedString(arg)) + self.messages.append(_normalized_string(arg)) elif keyword == 'msgstr': self.in_msgid = False @@ -306,13 +318,13 @@ def _process_keyword_line(self, lineno, line, obsolete=False) -> None: if len(msg) < 2: self._invalid_pofile(line, lineno, "msgstr plural doesn't have a message") msg = '""' - self.translations.append([idx, _NormalizedString(msg)]) + self.translations.append([idx, _normalized_string(msg)]) else: - self.translations.append([0, _NormalizedString(arg)]) + self.translations.append([0, _normalized_string(arg)]) elif keyword == 'msgctxt': self.in_msgctxt = True - self.context = _NormalizedString(arg) + self.context = _normalized_string(arg) def _process_string_continuation_line(self, line, lineno) -> None: if self.in_msgid: @@ -324,6 +336,10 @@ def _process_string_continuation_line(self, line, lineno) -> None: else: self._invalid_pofile(line, lineno, "Got line starting with \" but not in msgid, msgstr or msgctxt") return + assert line[0] == '"' + if line[-1] != '"': + self._invalid_pofile(line, lineno, "Continuation string must end with double quotes") + line += '"' s.append(line) def _process_comment(self, line) -> None: diff --git a/tests/messages/test_pofile.py b/tests/messages/test_pofile.py index 1c083fe76..b875eb917 100644 --- a/tests/messages/test_pofile.py +++ b/tests/messages/test_pofile.py @@ -505,6 +505,7 @@ def test_with_location(self): def test_abort_invalid_po_file(self): + # right double quote missing on msgstr, continuation missing both double quotes invalid_po = ''' msgctxt "" "{\"checksum\": 2148532640, \"cxt\": \"collector_thankyou\", \"id\": " @@ -517,6 +518,7 @@ def test_abort_invalid_po_file(self): Pour toute question, veuillez communiquer avec Fulano à nadie@blah.com " ''' + # as above, but contains ascii-only characters invalid_po_2 = ''' msgctxt "" "{\"checksum\": 2148532640, \"cxt\": \"collector_thankyou\", \"id\": " @@ -529,20 +531,29 @@ def test_abort_invalid_po_file(self): Pour toute question, veuillez communiquer avec Fulano a fulano@blah.com " ''' - # Catalog not created, throws Unicode Error - buf = StringIO(invalid_po) - output = pofile.read_po(buf, locale='fr', abort_invalid=False) - assert isinstance(output, Catalog) - - # Catalog not created, throws PoFileError - buf = StringIO(invalid_po_2) - with pytest.raises(pofile.PoFileError): - pofile.read_po(buf, locale='fr', abort_invalid=True) - - # Catalog is created with warning, no abort - buf = StringIO(invalid_po_2) - output = pofile.read_po(buf, locale='fr', abort_invalid=False) - assert isinstance(output, Catalog) + # left double quote missing in msgid + invalid_po_3 = ''' + msgid *A" + msgstr "B" + ''' + # right double quote missing in msgstr + invalid_po_4 = ''' + msgid "A" + msgstr "B* + ''' + # right double quote missing in msgstr continuation + invalid_po_5 = ''' + msgid "A" + msgstr "" + "* + ''' + for incorrectly_delimited_po_string in (invalid_po, invalid_po_2, invalid_po_3, invalid_po_4, invalid_po_5): + buf = StringIO(incorrectly_delimited_po_string) + with pytest.raises(pofile.PoFileError): + pofile.read_po(buf, locale='fr', abort_invalid=True) + buf.seek(0) + output = pofile.read_po(buf, locale='fr', abort_invalid=False) + assert isinstance(output, Catalog) def test_invalid_msgstr_plural(self): # msgstr plural broken @@ -563,7 +574,12 @@ def test_invalid_msgstr_plural(self): msgid "A" msgstr[0] ''' - for incorrectly_plural in (invalid_1, invalid_2, invalid_3, invalid_4): + # not correctly delimited + invalid_5 = ''' + msgid "A" + msgstr[0] not delimited + ''' + for incorrectly_plural in (invalid_1, invalid_2, invalid_3, invalid_4, invalid_5): buf = StringIO(incorrectly_plural) with pytest.raises(pofile.PoFileError): pofile.read_po(buf, locale='fr', abort_invalid=True) From ff5213b54cad7be65bcf7112abea51a40c4d6517 Mon Sep 17 00:00:00 2001 From: Rodrigo Tobar Date: Sun, 17 Nov 2024 23:05:01 +0800 Subject: [PATCH 7/7] Formalise constraint on _NormalizeString's inputs Now that all strings given as inputs to _NormalizeString have been verified (or corrected) to be correctly delimited with double quotes, there's no reason to continue doing an internal strip anymore. Moreover, we can express this internal constraint with an assertion to avoid issues in the future. Signed-off-by: Rodrigo Tobar --- babel/messages/pofile.py | 3 ++- tests/messages/test_normalized_string.py | 21 ++++++++------------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/babel/messages/pofile.py b/babel/messages/pofile.py index 3d8eb8b05..4190d4104 100644 --- a/babel/messages/pofile.py +++ b/babel/messages/pofile.py @@ -144,7 +144,8 @@ def __init__(self, *args: str) -> None: self.append(arg) def append(self, s: str) -> None: - self._strs.append(s.strip()) + assert s[0] == '"' and s[-1] == '"' + self._strs.append(s) def denormalize(self) -> str: return ''.join(map(unescape, self._strs)) diff --git a/tests/messages/test_normalized_string.py b/tests/messages/test_normalized_string.py index 735603770..f398a1680 100644 --- a/tests/messages/test_normalized_string.py +++ b/tests/messages/test_normalized_string.py @@ -4,19 +4,14 @@ def test_normalized_string(): - ab1 = _NormalizedString('"a"', '"b" ') - ab2 = _NormalizedString('"a"', ' "b"') - ac1 = _NormalizedString('"a"', '"c"') - ac2 = _NormalizedString(' "a"', '"c" ') + ab = _NormalizedString('"a"', '"b"') + ac = _NormalizedString('"a"', '"c"') z = _NormalizedString() - assert ab1 == ab2 and ac1 == ac2 # __eq__ - assert ab1 < ac1 # __lt__ - assert ac1 > ab2 # __gt__ - assert ac1 >= ac2 # __ge__ - assert ab1 <= ab2 # __le__ - assert ab1 != ac1 # __ne__ + assert ab < ac # __lt__ + assert ac > ab # __gt__ + assert ab != ac # __ne__ assert not z # __nonzero__ / __bool__ - assert sorted([ab1, ab2, ac1]) == [ab1, ab2, ac1] # sorted() is stable + assert sorted([ac, ab, z]) == [z, ab, ac] # sorted() is stable @pytest.mark.parametrize( @@ -38,10 +33,10 @@ def test_denormalized_simple_normalized_string(original, denormalized): "originals, denormalized", ( (('"a"', '"b"'), "ab"), - (('"a" ', '"b"'), "ab"), + (('"a"', '"b"'), "ab"), (('"ab"', '""'), "ab"), (('"ab"', '"c"'), "abc"), - (('"a"', ' "bc"'), "abc"), + (('"a"', '"bc"'), "abc"), ), ) def test_denormalized_multi_normalized_string(originals, denormalized):