From 913feaac9c0db351536ce6c4317ede1230856383 Mon Sep 17 00:00:00 2001
From: Rodrigo Tobar <rtobar@icrar.org>
Date: Sun, 17 Nov 2024 23:10:53 +0800
Subject: [PATCH 1/7] Avoid re-compiling regular expression

While the re module caches some of the latest compilations, it's better
form to not rely on it doing so.

Signed-off-by: Rodrigo Tobar <rtobar@icrar.org>
---
 babel/messages/pofile.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/babel/messages/pofile.py b/babel/messages/pofile.py
index 721707657..725fd7303 100644
--- a/babel/messages/pofile.py
+++ b/babel/messages/pofile.py
@@ -26,6 +26,8 @@
     from typing_extensions import Literal
 
 
+_ESCAPED_CHARACTER_PATTERN = re.compile(r'\\([\\trn"])')
+
 def unescape(string: str) -> str:
     r"""Reverse `escape` the given string.
 
@@ -46,7 +48,7 @@ def replace_escapes(match):
             return '\r'
         # m is \ or "
         return m
-    return re.compile(r'\\([\\trn"])').sub(replace_escapes, string[1:-1])
+    return _ESCAPED_CHARACTER_PATTERN.sub(replace_escapes, string[1:-1])
 
 
 def denormalize(string: str) -> str:

From 3647ae867f35049905f0c8efa741545e7ffffd2b Mon Sep 17 00:00:00 2001
From: Rodrigo Tobar <rtobar@icrar.org>
Date: Sun, 17 Nov 2024 11:08:55 +0800
Subject: [PATCH 2/7] Remove duplicate test assertion

The exact same check is performed a few lines above.

Signed-off-by: Rodrigo Tobar <rtobar@icrar.org>
---
 tests/messages/test_pofile.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/tests/messages/test_pofile.py b/tests/messages/test_pofile.py
index ed9da78f6..f0431f811 100644
--- a/tests/messages/test_pofile.py
+++ b/tests/messages/test_pofile.py
@@ -544,11 +544,6 @@ def test_abort_invalid_po_file(self):
         output = pofile.read_po(buf, locale='fr', abort_invalid=False)
         assert isinstance(output, Catalog)
 
-        # Catalog not created, aborted with PoFileError
-        buf = StringIO(invalid_po_2)
-        with pytest.raises(pofile.PoFileError):
-            pofile.read_po(buf, locale='fr', abort_invalid=True)
-
     def test_invalid_pofile_with_abort_flag(self):
         parser = pofile.PoFileParser(None, abort_invalid=True)
         lineno = 10

From 5e84dc612bcff29ffd1f70ffc8803726bd3c6824 Mon Sep 17 00:00:00 2001
From: Rodrigo Tobar <rtobar@icrar.org>
Date: Sun, 17 Nov 2024 22:15:00 +0800
Subject: [PATCH 3/7] Perform full intended assertion in test

Since Python 2.3 sorted() has been guaranteed to be stable. The comment
was wrong, and thus it makes sense to do the full assertion as clearly
intended.

Signed-off-by: Rodrigo Tobar <rtobar@icrar.org>
---
 tests/messages/test_normalized_string.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/messages/test_normalized_string.py b/tests/messages/test_normalized_string.py
index 9c95672b4..45ff2ad98 100644
--- a/tests/messages/test_normalized_string.py
+++ b/tests/messages/test_normalized_string.py
@@ -14,4 +14,4 @@ def test_normalized_string():
     assert ab1 <= ab2  # __le__
     assert ab1 != ac1  # __ne__
     assert not z  # __nonzero__ / __bool__
-    assert sorted([ab1, ab2, ac1])  # the sort order is not stable so we can't really check it, just that we can sort
+    assert sorted([ab1, ab2, ac1]) == [ab1, ab2, ac1] # sorted() is stable

From 663f97e68879a4742fab501c82cfdbc64e61ab4a Mon Sep 17 00:00:00 2001
From: Rodrigo Tobar <rtobar@icrar.org>
Date: Sun, 17 Nov 2024 22:32:56 +0800
Subject: [PATCH 4/7] Clarify usage of of _NormalizeString

The _NormalizeString helper class mixes some responsibilities, not only
acting as a container for potentially multiple lines of a single string
message, but also doing and hiding some of the parsing of such strings.
"Doing" because it performs a .strip() on all incoming strings in order
to remove any whitespace before/after them, and "hiding" because when
invoking the "denormalize" method, each line is slices to remove the
first and last element, which are implicitly assumed to be the string
delimiters (double quotes, in principle).

These multiple roles have already led to confusion within the codebase
as to how this class is supposed to be used. Its existing unit test
doesn't provide strings with proper delimiters (and thus calling
.denormalize() on these objects would return unexpected results -- empty
strings in all cases). Similarly, missing msgstr instances also result
in a call to _NormalizeString(""), which does work, but is conceptually
incorrect, as the empty string is somethiing that _NormalizeString
should never see coming in.

This commit changes all the places where confusing usage of the
_NormalizeString class happens. In particular, the existing unit test's
strings are now always delimited by double quotes (so calling
.denormalize on them would yield the expected value). A number of new
unit tests have also been added exercising the denormalize() method,
which includes unescaping escaped characters. Finally, the construction
of an empty string message has been simplified to _NormalizeString().

Signed-off-by: Rodrigo Tobar <rtobar@icrar.org>
---
 babel/messages/pofile.py                 |  6 ++--
 tests/messages/test_normalized_string.py | 39 +++++++++++++++++++++---
 2 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/babel/messages/pofile.py b/babel/messages/pofile.py
index 725fd7303..97a3722d8 100644
--- a/babel/messages/pofile.py
+++ b/babel/messages/pofile.py
@@ -251,7 +251,7 @@ def _finish_current_message(self) -> None:
         if self.messages:
             if not self.translations:
                 self._invalid_pofile("", self.offset, f"missing msgstr for msgid '{self.messages[0].denormalize()}'")
-                self.translations.append([0, _NormalizedString("")])
+                self.translations.append([0, _NormalizedString()])
             self._add_message()
 
     def _process_message_line(self, lineno, line, obsolete=False) -> None:
@@ -368,8 +368,8 @@ def parse(self, fileobj: IO[AnyStr] | Iterable[AnyStr]) -> None:
         # No actual messages found, but there was some info in comments, from which
         # we'll construct an empty header message
         if not self.counter and (self.flags or self.user_comments or self.auto_comments):
-            self.messages.append(_NormalizedString('""'))
-            self.translations.append([0, _NormalizedString('""')])
+            self.messages.append(_NormalizedString())
+            self.translations.append([0, _NormalizedString()])
             self._add_message()
 
     def _invalid_pofile(self, line, lineno, msg) -> None:
diff --git a/tests/messages/test_normalized_string.py b/tests/messages/test_normalized_string.py
index 45ff2ad98..735603770 100644
--- a/tests/messages/test_normalized_string.py
+++ b/tests/messages/test_normalized_string.py
@@ -1,11 +1,13 @@
+import pytest
+
 from babel.messages.pofile import _NormalizedString
 
 
 def test_normalized_string():
-    ab1 = _NormalizedString('a', 'b ')
-    ab2 = _NormalizedString('a', ' b')
-    ac1 = _NormalizedString('a', 'c')
-    ac2 = _NormalizedString('  a', 'c  ')
+    ab1 = _NormalizedString('"a"', '"b" ')
+    ab2 = _NormalizedString('"a"', ' "b"')
+    ac1 = _NormalizedString('"a"', '"c"')
+    ac2 = _NormalizedString('  "a"', '"c"  ')
     z = _NormalizedString()
     assert ab1 == ab2 and ac1 == ac2  # __eq__
     assert ab1 < ac1  # __lt__
@@ -15,3 +17,32 @@ def test_normalized_string():
     assert ab1 != ac1  # __ne__
     assert not z  # __nonzero__ / __bool__
     assert sorted([ab1, ab2, ac1]) == [ab1, ab2, ac1] # sorted() is stable
+
+
+@pytest.mark.parametrize(
+    "original, denormalized",
+    (
+        ('""', ""),
+        ('"a"', "a"),
+        ('"a\\n"', "a\n"),
+        ('"a\\r"', "a\r"),
+        ('"a\\t"', "a\t"),
+        ('"a\\""', 'a"'),
+        ('"a\\\\"', "a\\"),
+    ),
+)
+def test_denormalized_simple_normalized_string(original, denormalized):
+    assert denormalized == _NormalizedString(original).denormalize()
+
+@pytest.mark.parametrize(
+    "originals, denormalized",
+    (
+        (('"a"', '"b"'), "ab"),
+        (('"a" ', '"b"'), "ab"),
+        (('"ab"', '""'), "ab"),
+        (('"ab"', '"c"'), "abc"),
+        (('"a"', '     "bc"'), "abc"),
+    ),
+)
+def test_denormalized_multi_normalized_string(originals, denormalized):
+    assert denormalized == _NormalizedString(*originals).denormalize()

From 5f09192809ac70706fce876f1ba62c6094811cf8 Mon Sep 17 00:00:00 2001
From: Rodrigo Tobar <rtobar@icrar.org>
Date: Tue, 17 Dec 2024 16:28:37 +0800
Subject: [PATCH 5/7] Add further msgstr plural checks

---
 babel/messages/pofile.py      | 17 +++++++++++++++--
 tests/messages/test_pofile.py | 28 ++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/babel/messages/pofile.py b/babel/messages/pofile.py
index 97a3722d8..d927ecf1c 100644
--- a/babel/messages/pofile.py
+++ b/babel/messages/pofile.py
@@ -292,8 +292,21 @@ def _process_keyword_line(self, lineno, line, obsolete=False) -> None:
             self.in_msgid = False
             self.in_msgstr = True
             if arg.startswith('['):
-                idx, msg = arg[1:].split(']', 1)
-                self.translations.append([int(idx), _NormalizedString(msg)])
+                arg = arg[1:]
+                if ']' not in arg:
+                    self._invalid_pofile(line, lineno, "msgstr plural doesn't include a closing bracket")
+                    # if it doesn't include ] it probablyd doesn't have a number either
+                    arg = '0]' + arg
+                idx, msg = arg.split(']', 1)
+                try:
+                    idx = int(idx)
+                except ValueError:
+                    self._invalid_pofile(line, lineno, "msgstr plural's index is not a number")
+                    idx = 0
+                if len(msg) < 2:
+                    self._invalid_pofile(line, lineno, "msgstr plural doesn't have a message")
+                    msg = '""'
+                self.translations.append([idx, _NormalizedString(msg)])
             else:
                 self.translations.append([0, _NormalizedString(arg)])
 
diff --git a/tests/messages/test_pofile.py b/tests/messages/test_pofile.py
index f0431f811..1c083fe76 100644
--- a/tests/messages/test_pofile.py
+++ b/tests/messages/test_pofile.py
@@ -544,6 +544,34 @@ def test_abort_invalid_po_file(self):
         output = pofile.read_po(buf, locale='fr', abort_invalid=False)
         assert isinstance(output, Catalog)
 
+    def test_invalid_msgstr_plural(self):
+        # msgstr plural broken
+        invalid_1 = '''
+        msgid "A"
+        msgstr[
+        '''
+        invalid_2 = '''
+        msgid "A"
+        msgstr[] "no index"
+        '''
+        invalid_3 = '''
+        msgid "A"
+        msgstr[bah] "index is not a number"
+        '''
+        # no content
+        invalid_4 = '''
+        msgid "A"
+        msgstr[0]
+        '''
+        for incorrectly_plural in (invalid_1, invalid_2, invalid_3, invalid_4):
+            buf = StringIO(incorrectly_plural)
+            with pytest.raises(pofile.PoFileError):
+                pofile.read_po(buf, locale='fr', abort_invalid=True)
+            buf.seek(0)
+            output = pofile.read_po(buf, locale='fr', abort_invalid=False)
+            assert isinstance(output, Catalog)
+
+
     def test_invalid_pofile_with_abort_flag(self):
         parser = pofile.PoFileParser(None, abort_invalid=True)
         lineno = 10

From 00062fbe4b8619a6aaa29b16825a6d974f03a3ea Mon Sep 17 00:00:00 2001
From: Rodrigo Tobar <rtobar@icrar.org>
Date: Sun, 17 Nov 2024 11:06:03 +0800
Subject: [PATCH 6/7] Detect incorrectly delimited strings

Strings should be delimited on both ends by double quotes, but this
is currently not being been detected, and content is simply being
incorrectly trimmed.

This commit adds a check for each string to verify it starts and ends
with a double quote character, issuing a warning/error if that's not the
case (and fixing it as appropriate).

A few new test cases have been added to check that the lack of double
quotes to delimit strings issues errors as expected.

Signed-off-by: Rodrigo Tobar <rtobar@icrar.org>
---
 babel/messages/pofile.py      | 24 +++++++++++++++---
 tests/messages/test_pofile.py | 46 +++++++++++++++++++++++------------
 2 files changed, 51 insertions(+), 19 deletions(-)

diff --git a/babel/messages/pofile.py b/babel/messages/pofile.py
index d927ecf1c..3d8eb8b05 100644
--- a/babel/messages/pofile.py
+++ b/babel/messages/pofile.py
@@ -278,6 +278,18 @@ def _process_keyword_line(self, lineno, line, obsolete=False) -> None:
 
         self.obsolete = obsolete
 
+        def _normalized_string(s):
+            # whole lines are already stripped on both ends
+            s = s.lstrip()
+            assert s, "string to normalize shouldn't be empty"
+            if s[0] != '"':
+                self._invalid_pofile(line, lineno, "String must be delimited on the left by double quotes")
+                s = '"' + s
+            if s[-1] != '"':
+                self._invalid_pofile(line, lineno, "String must be delimited on the right by double quotes")
+                s += '"'
+            return _NormalizedString(s)
+
         # The line that has the msgid is stored as the offset of the msg
         # should this be the msgctxt if it has one?
         if keyword == 'msgid':
@@ -286,7 +298,7 @@ def _process_keyword_line(self, lineno, line, obsolete=False) -> None:
         if keyword in ['msgid', 'msgid_plural']:
             self.in_msgctxt = False
             self.in_msgid = True
-            self.messages.append(_NormalizedString(arg))
+            self.messages.append(_normalized_string(arg))
 
         elif keyword == 'msgstr':
             self.in_msgid = False
@@ -306,13 +318,13 @@ def _process_keyword_line(self, lineno, line, obsolete=False) -> None:
                 if len(msg) < 2:
                     self._invalid_pofile(line, lineno, "msgstr plural doesn't have a message")
                     msg = '""'
-                self.translations.append([idx, _NormalizedString(msg)])
+                self.translations.append([idx, _normalized_string(msg)])
             else:
-                self.translations.append([0, _NormalizedString(arg)])
+                self.translations.append([0, _normalized_string(arg)])
 
         elif keyword == 'msgctxt':
             self.in_msgctxt = True
-            self.context = _NormalizedString(arg)
+            self.context = _normalized_string(arg)
 
     def _process_string_continuation_line(self, line, lineno) -> None:
         if self.in_msgid:
@@ -324,6 +336,10 @@ def _process_string_continuation_line(self, line, lineno) -> None:
         else:
             self._invalid_pofile(line, lineno, "Got line starting with \" but not in msgid, msgstr or msgctxt")
             return
+        assert line[0] == '"'
+        if line[-1] != '"':
+            self._invalid_pofile(line, lineno, "Continuation string must end with double quotes")
+            line += '"'
         s.append(line)
 
     def _process_comment(self, line) -> None:
diff --git a/tests/messages/test_pofile.py b/tests/messages/test_pofile.py
index 1c083fe76..b875eb917 100644
--- a/tests/messages/test_pofile.py
+++ b/tests/messages/test_pofile.py
@@ -505,6 +505,7 @@ def test_with_location(self):
 
 
     def test_abort_invalid_po_file(self):
+        # right double quote missing on msgstr, continuation missing both double quotes
         invalid_po = '''
             msgctxt ""
             "{\"checksum\": 2148532640, \"cxt\": \"collector_thankyou\", \"id\": "
@@ -517,6 +518,7 @@ def test_abort_invalid_po_file(self):
             Pour toute question, veuillez communiquer avec Fulano  à nadie@blah.com
             "
         '''
+        # as above, but contains ascii-only characters
         invalid_po_2 = '''
             msgctxt ""
             "{\"checksum\": 2148532640, \"cxt\": \"collector_thankyou\", \"id\": "
@@ -529,20 +531,29 @@ def test_abort_invalid_po_file(self):
             Pour toute question, veuillez communiquer avec Fulano a fulano@blah.com
             "
             '''
-        # Catalog not created, throws Unicode Error
-        buf = StringIO(invalid_po)
-        output = pofile.read_po(buf, locale='fr', abort_invalid=False)
-        assert isinstance(output, Catalog)
-
-        # Catalog not created, throws PoFileError
-        buf = StringIO(invalid_po_2)
-        with pytest.raises(pofile.PoFileError):
-            pofile.read_po(buf, locale='fr', abort_invalid=True)
-
-        # Catalog is created with warning, no abort
-        buf = StringIO(invalid_po_2)
-        output = pofile.read_po(buf, locale='fr', abort_invalid=False)
-        assert isinstance(output, Catalog)
+        # left double quote missing in msgid
+        invalid_po_3 = '''
+            msgid *A"
+            msgstr "B"
+        '''
+        # right double quote missing in msgstr
+        invalid_po_4 = '''
+            msgid "A"
+            msgstr "B*
+        '''
+        # right double quote missing in msgstr continuation
+        invalid_po_5 = '''
+            msgid "A"
+            msgstr ""
+            "*
+        '''
+        for incorrectly_delimited_po_string in (invalid_po, invalid_po_2, invalid_po_3, invalid_po_4, invalid_po_5):
+            buf = StringIO(incorrectly_delimited_po_string)
+            with pytest.raises(pofile.PoFileError):
+                pofile.read_po(buf, locale='fr', abort_invalid=True)
+            buf.seek(0)
+            output = pofile.read_po(buf, locale='fr', abort_invalid=False)
+            assert isinstance(output, Catalog)
 
     def test_invalid_msgstr_plural(self):
         # msgstr plural broken
@@ -563,7 +574,12 @@ def test_invalid_msgstr_plural(self):
         msgid "A"
         msgstr[0]
         '''
-        for incorrectly_plural in (invalid_1, invalid_2, invalid_3, invalid_4):
+        # not correctly delimited
+        invalid_5 = '''
+        msgid "A"
+        msgstr[0] not delimited
+        '''
+        for incorrectly_plural in (invalid_1, invalid_2, invalid_3, invalid_4, invalid_5):
             buf = StringIO(incorrectly_plural)
             with pytest.raises(pofile.PoFileError):
                 pofile.read_po(buf, locale='fr', abort_invalid=True)

From ff5213b54cad7be65bcf7112abea51a40c4d6517 Mon Sep 17 00:00:00 2001
From: Rodrigo Tobar <rtobar@icrar.org>
Date: Sun, 17 Nov 2024 23:05:01 +0800
Subject: [PATCH 7/7] Formalise constraint on _NormalizeString's inputs

Now that all strings given as inputs to _NormalizeString have been
verified (or corrected) to be correctly delimited with double quotes,
there's no reason to continue doing an internal strip anymore. Moreover,
we can express this internal constraint with an assertion to avoid
issues in the future.

Signed-off-by: Rodrigo Tobar <rtobar@icrar.org>
---
 babel/messages/pofile.py                 |  3 ++-
 tests/messages/test_normalized_string.py | 21 ++++++++-------------
 2 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/babel/messages/pofile.py b/babel/messages/pofile.py
index 3d8eb8b05..4190d4104 100644
--- a/babel/messages/pofile.py
+++ b/babel/messages/pofile.py
@@ -144,7 +144,8 @@ def __init__(self, *args: str) -> None:
             self.append(arg)
 
     def append(self, s: str) -> None:
-        self._strs.append(s.strip())
+        assert s[0] == '"' and s[-1] == '"'
+        self._strs.append(s)
 
     def denormalize(self) -> str:
         return ''.join(map(unescape, self._strs))
diff --git a/tests/messages/test_normalized_string.py b/tests/messages/test_normalized_string.py
index 735603770..f398a1680 100644
--- a/tests/messages/test_normalized_string.py
+++ b/tests/messages/test_normalized_string.py
@@ -4,19 +4,14 @@
 
 
 def test_normalized_string():
-    ab1 = _NormalizedString('"a"', '"b" ')
-    ab2 = _NormalizedString('"a"', ' "b"')
-    ac1 = _NormalizedString('"a"', '"c"')
-    ac2 = _NormalizedString('  "a"', '"c"  ')
+    ab = _NormalizedString('"a"', '"b"')
+    ac = _NormalizedString('"a"', '"c"')
     z = _NormalizedString()
-    assert ab1 == ab2 and ac1 == ac2  # __eq__
-    assert ab1 < ac1  # __lt__
-    assert ac1 > ab2  # __gt__
-    assert ac1 >= ac2  # __ge__
-    assert ab1 <= ab2  # __le__
-    assert ab1 != ac1  # __ne__
+    assert ab < ac  # __lt__
+    assert ac > ab  # __gt__
+    assert ab != ac  # __ne__
     assert not z  # __nonzero__ / __bool__
-    assert sorted([ab1, ab2, ac1]) == [ab1, ab2, ac1] # sorted() is stable
+    assert sorted([ac, ab, z]) == [z, ab, ac] # sorted() is stable
 
 
 @pytest.mark.parametrize(
@@ -38,10 +33,10 @@ def test_denormalized_simple_normalized_string(original, denormalized):
     "originals, denormalized",
     (
         (('"a"', '"b"'), "ab"),
-        (('"a" ', '"b"'), "ab"),
+        (('"a"', '"b"'), "ab"),
         (('"ab"', '""'), "ab"),
         (('"ab"', '"c"'), "abc"),
-        (('"a"', '     "bc"'), "abc"),
+        (('"a"', '"bc"'), "abc"),
     ),
 )
 def test_denormalized_multi_normalized_string(originals, denormalized):