From 8004312aff7ed708ffb24757ccf9da51d3336b31 Mon Sep 17 00:00:00 2001
From: Victor Stinner <vstinner@python.org>
Date: Wed, 1 Nov 2023 02:34:04 +0100
Subject: [PATCH 1/9] gh-111089: Add cache to PyUnicode_AsUTF8() for embedded
 NUL

Add PyASCIIObject.state.embed_null member to Python str objects. It
is used as a cache by PyUnicode_AsUTF8() to only check once if a
string contains a null character. Strings created by
PyUnicode_FromString() have *embed_null* initialized to 0 since they
cannot contain a null character.

Global static strings now also initialize the *embed_null* member.
The chr(0) singleton ("\0" string) is the only static string which
contains a null character.
---
 Include/cpython/unicodeobject.h               |  9 +++-
 Include/internal/pycore_runtime_init.h        | 12 +++--
 .../internal/pycore_runtime_init_generated.h  |  2 +-
 ...-11-01-03-18-21.gh-issue-111089.GxXlz0.rst |  5 ++
 Modules/_testcapi/unicode.c                   |  7 ++-
 Objects/unicodeobject.c                       | 47 +++++++++++++++++--
 Tools/build/generate_global_objects.py        | 13 ++++-
 7 files changed, 85 insertions(+), 10 deletions(-)
 create mode 100644 Misc/NEWS.d/next/C API/2023-11-01-03-18-21.gh-issue-111089.GxXlz0.rst

diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h
index d200fa0622cef5..aa105460a44137 100644
--- a/Include/cpython/unicodeobject.h
+++ b/Include/cpython/unicodeobject.h
@@ -142,9 +142,16 @@ typedef struct {
         unsigned int ascii:1;
         /* The object is statically allocated. */
         unsigned int statically_allocated:1;
+        // Does the string embed null characters? Possible values:
+        //   0: No
+        //   1: Yes
+        //   2: Unknown, the string must be scanned
+        //   3: Invalid state (must not be used)
+        // Cache used by PyUnicode_AsUTF8() to avoid calling strlen().
+        unsigned int embed_null:2;
         /* Padding to ensure that PyUnicode_DATA() is always aligned to
            4 bytes (see issue #19537 on m68k). */
-        unsigned int :24;
+        unsigned int :22;
     } state;
 } PyASCIIObject;
 
diff --git a/Include/internal/pycore_runtime_init.h b/Include/internal/pycore_runtime_init.h
index 0799b7e701ce95..06d4a742d61ab5 100644
--- a/Include/internal/pycore_runtime_init.h
+++ b/Include/internal/pycore_runtime_init.h
@@ -215,7 +215,7 @@ extern PyTypeObject _PyExc_MemoryError;
         _PyBytes_SIMPLE_INIT((CH), 1) \
     }
 
-#define _PyUnicode_ASCII_BASE_INIT(LITERAL, ASCII) \
+#define _PyUnicode_ASCII_BASE_INIT(LITERAL, ASCII, EMBED_NUL) \
     { \
         .ob_base = _PyObject_HEAD_INIT(&PyUnicode_Type), \
         .length = sizeof(LITERAL) - 1, \
@@ -225,11 +225,17 @@ extern PyTypeObject _PyExc_MemoryError;
             .compact = 1, \
             .ascii = (ASCII), \
             .statically_allocated = 1, \
+            .embed_null = (EMBED_NUL), \
         }, \
     }
 #define _PyASCIIObject_INIT(LITERAL) \
     { \
-        ._ascii = _PyUnicode_ASCII_BASE_INIT((LITERAL), 1), \
+        ._ascii = _PyUnicode_ASCII_BASE_INIT((LITERAL), 1, 0), \
+        ._data = (LITERAL) \
+    }
+#define _PyASCIIObject_INIT_embed_null(LITERAL) \
+    { \
+        ._ascii = _PyUnicode_ASCII_BASE_INIT((LITERAL), 1, 1), \
         ._data = (LITERAL) \
     }
 #define INIT_STR(NAME, LITERAL) \
@@ -239,7 +245,7 @@ extern PyTypeObject _PyExc_MemoryError;
 #define _PyUnicode_LATIN1_INIT(LITERAL, UTF8) \
     { \
         ._latin1 = { \
-            ._base = _PyUnicode_ASCII_BASE_INIT((LITERAL), 0), \
+            ._base = _PyUnicode_ASCII_BASE_INIT((LITERAL), 0, 0), \
             .utf8 = (UTF8), \
             .utf8_length = sizeof(UTF8) - 1, \
         }, \
diff --git a/Include/internal/pycore_runtime_init_generated.h b/Include/internal/pycore_runtime_init_generated.h
index d41a7478db663f..1fe984112c81b8 100644
--- a/Include/internal/pycore_runtime_init_generated.h
+++ b/Include/internal/pycore_runtime_init_generated.h
@@ -1259,7 +1259,7 @@ extern "C" {
 }
 
 #define _Py_str_ascii_INIT { \
-    _PyASCIIObject_INIT("\x00"), \
+    _PyASCIIObject_INIT_embed_null("\x00"), \
     _PyASCIIObject_INIT("\x01"), \
     _PyASCIIObject_INIT("\x02"), \
     _PyASCIIObject_INIT("\x03"), \
diff --git a/Misc/NEWS.d/next/C API/2023-11-01-03-18-21.gh-issue-111089.GxXlz0.rst b/Misc/NEWS.d/next/C API/2023-11-01-03-18-21.gh-issue-111089.GxXlz0.rst
new file mode 100644
index 00000000000000..d797958c9bc67f
--- /dev/null
+++ b/Misc/NEWS.d/next/C API/2023-11-01-03-18-21.gh-issue-111089.GxXlz0.rst	
@@ -0,0 +1,5 @@
+Add ``PyASCIIObject.state.embed_null`` member to Python str objects. It is
+used as a cache by :c:func:`PyUnicode_AsUTF8` to only check once if a string
+contains a null character. Strings created by :c:func:`PyUnicode_FromString`
+initializes *embed_null* since the string cannot contain a null character.
+Patch by Victor Stinner.
diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c
index a10183dddeca98..950b924694710f 100644
--- a/Modules/_testcapi/unicode.c
+++ b/Modules/_testcapi/unicode.c
@@ -301,7 +301,12 @@ unicode_fromstring(PyObject *self, PyObject *arg)
     if (!PyArg_Parse(arg, "z#", &s, &size)) {
         return NULL;
     }
-    return PyUnicode_FromString(s);
+    PyObject *unicode = PyUnicode_FromString(s);
+    if (unicode == NULL) {
+        return NULL;
+    }
+    assert(((PyASCIIObject*)unicode)->state.embed_null == 0);
+    return unicode;
 }
 
 /* Test PyUnicode_FromKindAndData() */
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 87636efcfca050..da9b47b9c9b4f4 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -205,6 +205,10 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
 static inline int unicode_is_finalizing(void);
 static int unicode_is_singleton(PyObject *unicode);
 #endif
+static inline Py_ssize_t
+findchar(const void *s, int kind,
+         Py_ssize_t size, Py_UCS4 ch,
+         int direction);
 
 
 // Return a reference to the immortal empty string singleton.
@@ -623,6 +627,15 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
         }
         CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
     }
+
+    if (_PyUnicode_STATE(ascii).embed_null != 2) {
+        Py_ssize_t pos = findchar(PyUnicode_DATA(ascii),
+                                  PyUnicode_KIND(ascii),
+                                  PyUnicode_GET_LENGTH(ascii),
+                                  0, 1);
+        assert(_PyUnicode_STATE(ascii).embed_null == (pos >= 0));
+    }
+
     return 1;
 
 #undef CHECK
@@ -1253,6 +1266,7 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
     _PyUnicode_STATE(unicode).compact = 1;
     _PyUnicode_STATE(unicode).ascii = is_ascii;
     _PyUnicode_STATE(unicode).statically_allocated = 0;
+    _PyUnicode_STATE(unicode).embed_null = 2;
     if (is_ascii) {
         ((char*)data)[size] = 0;
     }
@@ -1890,7 +1904,16 @@ PyUnicode_FromString(const char *u)
         PyErr_SetString(PyExc_OverflowError, "input too long");
         return NULL;
     }
-    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
+    PyObject *unicode;
+    unicode = PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
+    if (unicode != NULL) {
+        // PyUnicode_DecodeUTF8Stateful(u, strlen(u)) cannot create NUL
+        // characters: the UTF-8 decoder with the strict error handler only
+        // creates a NUL character if the input string contains a NUL byte
+        // which cannot be the case here.
+        _PyUnicode_STATE(unicode).embed_null = 0;
+    }
+    return unicode;
 }
 
 
@@ -1932,6 +1955,7 @@ _PyUnicode_FromId(_Py_Identifier *id)
     if (!obj) {
         return NULL;
     }
+    _PyUnicode_STATE(obj).embed_null = 0;
     PyUnicode_InternInPlace(&obj);
 
     if (index >= ids->size) {
@@ -3846,10 +3870,27 @@ PyUnicode_AsUTF8(PyObject *unicode)
 {
     Py_ssize_t size;
     const char *utf8 = PyUnicode_AsUTF8AndSize(unicode, &size);
-    if (utf8 != NULL && strlen(utf8) != (size_t)size) {
-        PyErr_SetString(PyExc_ValueError, "embedded null character");
+    if (utf8 == NULL) {
         return NULL;
     }
+
+    // Cache to avoid calling O(n) strlen() operation at every
+    // PyUnicode_AsUTF8() call on the same object.
+    if (_PyUnicode_STATE(unicode).embed_null == 2) {
+        if (strlen(utf8) != (size_t)size) {
+            _PyUnicode_STATE(unicode).embed_null = 1;
+        }
+        else {
+            _PyUnicode_STATE(unicode).embed_null = 0;
+        }
+    }
+
+    if (_PyUnicode_STATE(unicode).embed_null == 1) {
+        PyErr_SetString(PyExc_ValueError,
+                        "embedded null character");
+        return NULL;
+    }
+
     return utf8;
 }
 
diff --git a/Tools/build/generate_global_objects.py b/Tools/build/generate_global_objects.py
index ded19ee489e79b..fbf33ef1a4ad13 100644
--- a/Tools/build/generate_global_objects.py
+++ b/Tools/build/generate_global_objects.py
@@ -232,6 +232,14 @@ def open_for_changes(filename, orig):
 def generate_global_strings(identifiers, strings):
     filename = os.path.join(INTERNAL, 'pycore_global_strings.h')
 
+    # NUL characters are not supported; see _PyASCIIObject_INIT_embed_null().
+    for identifier in identifiers:
+        if "\0" in identifier:
+            raise Exception(f"an identifier contains a null character: {identifier!r}")
+    for string in strings:
+        if "\0" in string:
+            raise Exception(f"a string contains a null character: {string!r}")
+
     # Read the non-generated part of the file.
     with open(filename) as infile:
         orig = infile.read()
@@ -321,7 +329,10 @@ def generate_runtime_init(identifiers, strings):
         printer.write('')
         with printer.block('#define _Py_str_ascii_INIT', continuation=True):
             for i in range(128):
-                printer.write(f'_PyASCIIObject_INIT("\\x{i:02x}"),')
+                if i == 0:
+                    printer.write(f'_PyASCIIObject_INIT_embed_null("\\x{i:02x}"),')
+                else:
+                    printer.write(f'_PyASCIIObject_INIT("\\x{i:02x}"),')
                 immortal_objects.append(f'(PyObject *)&_Py_SINGLETON(strings).ascii[{i}]')
         printer.write('')
         with printer.block('#define _Py_str_latin1_INIT', continuation=True):

From a7e93c9ea9ba52738c9fb0f878bb1f8bb19cd496 Mon Sep 17 00:00:00 2001
From: Victor Stinner <vstinner@python.org>
Date: Wed, 1 Nov 2023 03:53:44 +0100
Subject: [PATCH 2/9] fixup! gh-111089: Add cache to PyUnicode_AsUTF8() for
 embedded NUL

Fix unicode_subtype_new
---
 Objects/unicodeobject.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index da9b47b9c9b4f4..a8b02180e4e83a 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -14668,6 +14668,7 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
     _PyUnicode_STATE(self).compact = 0;
     _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
     _PyUnicode_STATE(self).statically_allocated = 0;
+    _PyUnicode_STATE(self).embed_null = 2;
     _PyUnicode_UTF8_LENGTH(self) = 0;
     _PyUnicode_UTF8(self) = NULL;
     _PyUnicode_DATA_ANY(self) = NULL;

From 4ccd7d9c66403db8477c16ae9068cccfd81f5cff Mon Sep 17 00:00:00 2001
From: Victor Stinner <vstinner@python.org>
Date: Wed, 1 Nov 2023 04:38:39 +0100
Subject: [PATCH 3/9] Fix _PyUnicode_CheckConsistency() in release mode

---
 Objects/unicodeobject.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index a8b02180e4e83a..9be204be60c3ba 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -633,7 +633,7 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
                                   PyUnicode_KIND(ascii),
                                   PyUnicode_GET_LENGTH(ascii),
                                   0, 1);
-        assert(_PyUnicode_STATE(ascii).embed_null == (pos >= 0));
+        CHECK(_PyUnicode_STATE(ascii).embed_null == (pos >= 0));
     }
 
     return 1;

From 3c4844f895b1b3911cf1c48cdfa778ef6fe1e562 Mon Sep 17 00:00:00 2001
From: Victor Stinner <vstinner@python.org>
Date: Wed, 1 Nov 2023 22:35:58 +0100
Subject: [PATCH 4/9] Add EMBED_NULL_UNKNOWN constant

---
 Objects/unicodeobject.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 9be204be60c3ba..ccfe64dd416d3f 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -189,6 +189,8 @@ NOTE: In the interpreter's initialization phase, some globals are currently
 #  define OVERALLOCATE_FACTOR 4
 #endif
 
+#define EMBED_NULL_UNKNOWN 2
+
 /* Forward declaration */
 static inline int
 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
@@ -628,7 +630,7 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
         CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
     }
 
-    if (_PyUnicode_STATE(ascii).embed_null != 2) {
+    if (_PyUnicode_STATE(ascii).embed_null != EMBED_NULL_UNKNOWN) {
         Py_ssize_t pos = findchar(PyUnicode_DATA(ascii),
                                   PyUnicode_KIND(ascii),
                                   PyUnicode_GET_LENGTH(ascii),
@@ -1266,7 +1268,7 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
     _PyUnicode_STATE(unicode).compact = 1;
     _PyUnicode_STATE(unicode).ascii = is_ascii;
     _PyUnicode_STATE(unicode).statically_allocated = 0;
-    _PyUnicode_STATE(unicode).embed_null = 2;
+    _PyUnicode_STATE(unicode).embed_null = EMBED_NULL_UNKNOWN;
     if (is_ascii) {
         ((char*)data)[size] = 0;
     }
@@ -3876,7 +3878,7 @@ PyUnicode_AsUTF8(PyObject *unicode)
 
     // Cache to avoid calling O(n) strlen() operation at every
     // PyUnicode_AsUTF8() call on the same object.
-    if (_PyUnicode_STATE(unicode).embed_null == 2) {
+    if (_PyUnicode_STATE(unicode).embed_null == EMBED_NULL_UNKNOWN) {
         if (strlen(utf8) != (size_t)size) {
             _PyUnicode_STATE(unicode).embed_null = 1;
         }
@@ -14668,7 +14670,7 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
     _PyUnicode_STATE(self).compact = 0;
     _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
     _PyUnicode_STATE(self).statically_allocated = 0;
-    _PyUnicode_STATE(self).embed_null = 2;
+    _PyUnicode_STATE(self).embed_null = EMBED_NULL_UNKNOWN;
     _PyUnicode_UTF8_LENGTH(self) = 0;
     _PyUnicode_UTF8(self) = NULL;
     _PyUnicode_DATA_ANY(self) = NULL;

From e2247511e9228c0810e153b31de4fee369b43e75 Mon Sep 17 00:00:00 2001
From: Victor Stinner <vstinner@python.org>
Date: Wed, 1 Nov 2023 23:16:32 +0100
Subject: [PATCH 5/9] unicode_resize() clears embed_null cache

---
 Objects/unicodeobject.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index ccfe64dd416d3f..13dc26b8dc9869 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1015,6 +1015,7 @@ resize_compact(PyObject *unicode, Py_ssize_t length)
         _PyUnicode_UTF8(unicode) = NULL;
         _PyUnicode_UTF8_LENGTH(unicode) = 0;
     }
+    _PyUnicode_STATE(unicode).embed_null = EMBED_NULL_UNKNOWN;
 #ifdef Py_TRACE_REFS
     _Py_ForgetReference(unicode);
 #endif
@@ -1068,6 +1069,7 @@ resize_inplace(PyObject *unicode, Py_ssize_t length)
         _PyUnicode_UTF8(unicode) = NULL;
         _PyUnicode_UTF8_LENGTH(unicode) = 0;
     }
+    _PyUnicode_STATE(unicode).embed_null = EMBED_NULL_UNKNOWN;
 
     data = (PyObject *)PyObject_Realloc(data, new_size);
     if (data == NULL) {
@@ -11082,6 +11084,7 @@ PyUnicode_Append(PyObject **p_left, PyObject *right)
         Py_DECREF(left);
         *p_left = res;
     }
+    assert(_PyUnicode_STATE(*p_left).embed_null == EMBED_NULL_UNKNOWN);
     assert(_PyUnicode_CheckConsistency(*p_left, 1));
     return;
 

From 65c667132ddff29e5d253e43412d840afbdc49c3 Mon Sep 17 00:00:00 2001
From: Victor Stinner <vstinner@python.org>
Date: Wed, 1 Nov 2023 23:34:18 +0100
Subject: [PATCH 6/9] Add What's New entry

---
 Doc/whatsnew/3.13.rst                                    | 9 ++++++++-
 .../C API/2023-11-01-03-18-21.gh-issue-111089.GxXlz0.rst | 2 +-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst
index e5f39c58490b85..91ddcf313c8f3e 100644
--- a/Doc/whatsnew/3.13.rst
+++ b/Doc/whatsnew/3.13.rst
@@ -1120,6 +1120,13 @@ New Features
 * Add :c:func:`PyUnicode_AsUTF8` function to the limited C API.
   (Contributed by Victor Stinner in :gh:`111089`.)
 
+* Add ``PyASCIIObject.state.embed_null`` member to Python :class:`str` objects.
+  It is used as a cache by :c:func:`PyUnicode_AsUTF8` to only check once if a
+  string contains a null character. Strings created by
+  :c:func:`PyUnicode_FromString` have *embed_null* initialized to 0 since they
+  cannot contain a null character.
+  (Contributed by Victor Stinner in :gh:`111089`.)
+
 
 Porting to Python 3.13
 ----------------------
@@ -1192,7 +1199,7 @@ Porting to Python 3.13
 
 * The :c:func:`PyUnicode_AsUTF8` function now raises an exception if the string
   contains embedded null characters. To accept embedded null characters and
-  truncate on purpose at the first null byte,
+  truncate on purpose at the first null character,
   ``PyUnicode_AsUTF8AndSize(unicode, NULL)`` can be used instead.
   (Contributed by Victor Stinner in :gh:`111089`.)
 
diff --git a/Misc/NEWS.d/next/C API/2023-11-01-03-18-21.gh-issue-111089.GxXlz0.rst b/Misc/NEWS.d/next/C API/2023-11-01-03-18-21.gh-issue-111089.GxXlz0.rst
index d797958c9bc67f..869c1639efa332 100644
--- a/Misc/NEWS.d/next/C API/2023-11-01-03-18-21.gh-issue-111089.GxXlz0.rst	
+++ b/Misc/NEWS.d/next/C API/2023-11-01-03-18-21.gh-issue-111089.GxXlz0.rst	
@@ -1,5 +1,5 @@
 Add ``PyASCIIObject.state.embed_null`` member to Python str objects. It is
 used as a cache by :c:func:`PyUnicode_AsUTF8` to only check once if a string
 contains a null character. Strings created by :c:func:`PyUnicode_FromString`
-initializes *embed_null* since the string cannot contain a null character.
+have *embed_null* initialized to 0 since they cannot contain a null character.
 Patch by Victor Stinner.

From 30bb7254a334ef0c276db4d00432046cefed9514 Mon Sep 17 00:00:00 2001
From: Victor Stinner <vstinner@python.org>
Date: Wed, 1 Nov 2023 23:38:58 +0100
Subject: [PATCH 7/9] Make the fast path faster.

Suggestion by Serhiy
---
 Objects/unicodeobject.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 13dc26b8dc9869..58f2597d3816fe 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -3880,19 +3880,21 @@ PyUnicode_AsUTF8(PyObject *unicode)
 
     // Cache to avoid calling O(n) strlen() operation at every
     // PyUnicode_AsUTF8() call on the same object.
-    if (_PyUnicode_STATE(unicode).embed_null == EMBED_NULL_UNKNOWN) {
-        if (strlen(utf8) != (size_t)size) {
-            _PyUnicode_STATE(unicode).embed_null = 1;
-        }
-        else {
-            _PyUnicode_STATE(unicode).embed_null = 0;
+    if (_PyUnicode_STATE(unicode).embed_null != 0) {
+        if (_PyUnicode_STATE(unicode).embed_null == EMBED_NULL_UNKNOWN) {
+            if (strlen(utf8) != (size_t)size) {
+                _PyUnicode_STATE(unicode).embed_null = 1;
+            }
+            else {
+                _PyUnicode_STATE(unicode).embed_null = 0;
+            }
         }
-    }
 
-    if (_PyUnicode_STATE(unicode).embed_null == 1) {
-        PyErr_SetString(PyExc_ValueError,
-                        "embedded null character");
-        return NULL;
+        if (_PyUnicode_STATE(unicode).embed_null == 1) {
+            PyErr_SetString(PyExc_ValueError,
+                            "embedded null character");
+            return NULL;
+        }
     }
 
     return utf8;

From 07975be323e9dec1de4b584830617c2f382b3c8d Mon Sep 17 00:00:00 2001
From: Victor Stinner <vstinner@python.org>
Date: Thu, 2 Nov 2023 00:23:57 +0100
Subject: [PATCH 8/9] Revert test unicode_fromstring() change

---
 Modules/_testcapi/unicode.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c
index 950b924694710f..a10183dddeca98 100644
--- a/Modules/_testcapi/unicode.c
+++ b/Modules/_testcapi/unicode.c
@@ -301,12 +301,7 @@ unicode_fromstring(PyObject *self, PyObject *arg)
     if (!PyArg_Parse(arg, "z#", &s, &size)) {
         return NULL;
     }
-    PyObject *unicode = PyUnicode_FromString(s);
-    if (unicode == NULL) {
-        return NULL;
-    }
-    assert(((PyASCIIObject*)unicode)->state.embed_null == 0);
-    return unicode;
+    return PyUnicode_FromString(s);
 }
 
 /* Test PyUnicode_FromKindAndData() */

From e3c6fa51d8c013e40f707b9d4bb9da9986f983dc Mon Sep 17 00:00:00 2001
From: Victor Stinner <vstinner@python.org>
Date: Thu, 2 Nov 2023 22:12:47 +0100
Subject: [PATCH 9/9] Set embed_null in more cases on new strings

* unicode_char()
* PyUnicode_FromWideChar(str, -1)
* _PyUnicode_Copy()
---
 Objects/unicodeobject.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 58f2597d3816fe..369d4660e8339a 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1795,6 +1795,8 @@ unicode_char(Py_UCS4 ch)
         assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
         PyUnicode_4BYTE_DATA(unicode)[0] = ch;
     }
+    // ch >= 256 and so cannot be 0
+    _PyUnicode_STATE(unicode).embed_null = 0;
     assert(_PyUnicode_CheckConsistency(unicode, 1));
     return unicode;
 }
@@ -1811,8 +1813,13 @@ PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
         return NULL;
     }
 
+    unsigned int embed_null;
     if (size == -1) {
         size = wcslen(u);
+        embed_null = 0;
+    }
+    else {
+        embed_null = EMBED_NULL_UNKNOWN;
     }
 
     /* If the Unicode data is known at construction time, we can apply
@@ -1877,6 +1884,7 @@ PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
     default:
         Py_UNREACHABLE();
     }
+    _PyUnicode_STATE(unicode).embed_null = embed_null;
 
     return unicode_result(unicode);
 }
@@ -2232,6 +2240,7 @@ _PyUnicode_Copy(PyObject *unicode)
 
     memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
               length * PyUnicode_KIND(unicode));
+    _PyUnicode_STATE(copy).embed_null = _PyUnicode_STATE(unicode).embed_null;
     assert(_PyUnicode_CheckConsistency(copy, 1));
     return copy;
 }