python · picnixz · Jan 23, 2025 · Dec 6, 2024 · Dec 6, 2024 · Dec 6, 2024
diff --git a/Lib/test/test_capi/test_codecs.py b/Lib/test/test_capi/test_codecs.py
@@ -839,7 +839,8 @@ def test_codec_ignore_errors_handler(self):
 
     def test_codec_replace_errors_handler(self):
         handler = _testcapi.codec_replace_errors
-        self.do_test_codec_errors_handler(handler, self.all_unicode_errors)
+        self.do_test_codec_errors_handler(handler, self.all_unicode_errors,
+                                          safe=True)
 
     def test_codec_xmlcharrefreplace_errors_handler(self):
         handler = _testcapi.codec_xmlcharrefreplace_errors
@@ -853,12 +854,12 @@ def test_codec_namereplace_errors_handler(self):
         handler = _testlimitedcapi.codec_namereplace_errors
         self.do_test_codec_errors_handler(handler, self.unicode_encode_errors)
 
-    def do_test_codec_errors_handler(self, handler, exceptions):
+    def do_test_codec_errors_handler(self, handler, exceptions, *, safe=False):
         at_least_one = False
         for exc in exceptions:
             # See https://github.com/python/cpython/issues/123378 and related
             # discussion and issues for details.
-            if self._exception_may_crash(exc):
+            if not safe and self._exception_may_crash(exc):
                 continue
 
             at_least_one = True

diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2024-12-06-11-32-58.gh-issue-126004.CYAwTB.rst b/Misc/NEWS.d/next/Core_and_Builtins/2024-12-06-11-32-58.gh-issue-126004.CYAwTB.rst
@@ -0,0 +1,3 @@
+Fix handling of :attr:`UnicodeError.start` and :attr:`UnicodeError.end`
+values in the :func:`codecs.replace_errors` error handler. Patch by Bénédikt
+Tran.
diff --git a/Python/codecs.c b/Python/codecs.c
@@ -702,55 +702,62 @@ PyObject *PyCodec_IgnoreErrors(PyObject *exc)
 
 PyObject *PyCodec_ReplaceErrors(PyObject *exc)
 {
-    Py_ssize_t start, end, i, len;
+    Py_ssize_t start, end;
 
     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
-        PyObject *res;
-        Py_UCS1 *outp;
-        if (PyUnicodeEncodeError_GetStart(exc, &start))
+        if (_PyUnicodeError_GetParams(exc, NULL, NULL, &start, &end, false) < 0) {
             return NULL;
-        if (PyUnicodeEncodeError_GetEnd(exc, &end))
-            return NULL;
-        len = end - start;
-        res = PyUnicode_New(len, '?');
-        if (res == NULL)
+        }
+        if (end <= start) {
+            goto oob;
+        }
+        Py_ssize_t len = end - start;
+        PyObject *res = PyUnicode_New(len, '?');
+        if (res == NULL) {
             return NULL;
+        }
         assert(PyUnicode_KIND(res) == PyUnicode_1BYTE_KIND);
-        outp = PyUnicode_1BYTE_DATA(res);
-        for (i = 0; i < len; ++i)
-            outp[i] = '?';
+        Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
+        memset(outp, (int)'?', sizeof(Py_UCS1) * len);
         assert(_PyUnicode_CheckConsistency(res, 1));
         return Py_BuildValue("(Nn)", res, end);
     }
     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
-        if (PyUnicodeDecodeError_GetEnd(exc, &end))
+        // _PyUnicodeError_GetParams() is slightly faster than the public getter
+        if (_PyUnicodeError_GetParams(exc, NULL, NULL, NULL, &end, true) < 0) {
             return NULL;
+        }
         return Py_BuildValue("(Cn)",
                              (int)Py_UNICODE_REPLACEMENT_CHARACTER,
                              end);
     }
     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
-        PyObject *res;
-        Py_UCS2 *outp;
-        if (PyUnicodeTranslateError_GetStart(exc, &start))
-            return NULL;
-        if (PyUnicodeTranslateError_GetEnd(exc, &end))
+        if (_PyUnicodeError_GetParams(exc, NULL, NULL, &start, &end, false) < 0) {
             return NULL;
-        len = end - start;
-        res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
-        if (res == NULL)
+        }
+        if (end <= start) {
+            goto oob;
+        }
+        Py_ssize_t len = end - start;
+        PyObject *res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
+        if (res == NULL) {
             return NULL;
+        }
         assert(PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND);
-        outp = PyUnicode_2BYTE_DATA(res);
-        for (i = 0; i < len; i++)
+        Py_UCS2 *outp = PyUnicode_2BYTE_DATA(res);
+        for (Py_ssize_t i = 0; i < len; ++i) {
             outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER;
+        }
         assert(_PyUnicode_CheckConsistency(res, 1));
         return Py_BuildValue("(Nn)", res, end);
     }
     else {
         wrong_exception_type(exc);
         return NULL;
     }
+
+oob:
+    return Py_BuildValue("(Nn)", Py_GetConstant(Py_CONSTANT_EMPTY_STR), end);
 }
 
 PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)