From 8004312aff7ed708ffb24757ccf9da51d3336b31 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Wed, 1 Nov 2023 02:34:04 +0100 Subject: [PATCH 1/9] gh-111089: Add cache to PyUnicode_AsUTF8() for embedded NUL Add PyASCIIObject.state.embed_null member to Python str objects. It is used as a cache by PyUnicode_AsUTF8() to only check once if a string contains a null character. Strings created by PyUnicode_FromString() initialize *embed_null* to 0 since the string cannot contain a null character. Global static strings now also initialize the *embed_null* member. The chr(0) singleton ("\0" string) is the only static string which contains a null character. --- Include/cpython/unicodeobject.h | 9 +++- Include/internal/pycore_runtime_init.h | 12 +++-- .../internal/pycore_runtime_init_generated.h | 2 +- ...-11-01-03-18-21.gh-issue-111089.GxXlz0.rst | 5 ++ Modules/_testcapi/unicode.c | 7 ++- Objects/unicodeobject.c | 47 +++++++++++++++++-- Tools/build/generate_global_objects.py | 13 ++++- 7 files changed, 85 insertions(+), 10 deletions(-) create mode 100644 Misc/NEWS.d/next/C API/2023-11-01-03-18-21.gh-issue-111089.GxXlz0.rst diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index d200fa0622cef5..aa105460a44137 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -142,9 +142,16 @@ typedef struct { unsigned int ascii:1; /* The object is statically allocated. */ unsigned int statically_allocated:1; + // Does the string embed null characters? Possible values: + // 0: No + // 1: Yes + // 2: Unknown, the string must be scanned + // 3: Invalid state (must not be used) + // Cache used by PyUnicode_AsUTF8() to avoid calling strlen(). + unsigned int embed_null:2; /* Padding to ensure that PyUnicode_DATA() is always aligned to 4 bytes (see issue #19537 on m68k). 
*/ - unsigned int :24; + unsigned int :22; } state; } PyASCIIObject; diff --git a/Include/internal/pycore_runtime_init.h b/Include/internal/pycore_runtime_init.h index 0799b7e701ce95..06d4a742d61ab5 100644 --- a/Include/internal/pycore_runtime_init.h +++ b/Include/internal/pycore_runtime_init.h @@ -215,7 +215,7 @@ extern PyTypeObject _PyExc_MemoryError; _PyBytes_SIMPLE_INIT((CH), 1) \ } -#define _PyUnicode_ASCII_BASE_INIT(LITERAL, ASCII) \ +#define _PyUnicode_ASCII_BASE_INIT(LITERAL, ASCII, EMBED_NUL) \ { \ .ob_base = _PyObject_HEAD_INIT(&PyUnicode_Type), \ .length = sizeof(LITERAL) - 1, \ @@ -225,11 +225,17 @@ extern PyTypeObject _PyExc_MemoryError; .compact = 1, \ .ascii = (ASCII), \ .statically_allocated = 1, \ + .embed_null = (EMBED_NUL), \ }, \ } #define _PyASCIIObject_INIT(LITERAL) \ { \ - ._ascii = _PyUnicode_ASCII_BASE_INIT((LITERAL), 1), \ + ._ascii = _PyUnicode_ASCII_BASE_INIT((LITERAL), 1, 0), \ + ._data = (LITERAL) \ + } +#define _PyASCIIObject_INIT_embed_null(LITERAL) \ + { \ + ._ascii = _PyUnicode_ASCII_BASE_INIT((LITERAL), 1, 1), \ ._data = (LITERAL) \ } #define INIT_STR(NAME, LITERAL) \ @@ -239,7 +245,7 @@ extern PyTypeObject _PyExc_MemoryError; #define _PyUnicode_LATIN1_INIT(LITERAL, UTF8) \ { \ ._latin1 = { \ - ._base = _PyUnicode_ASCII_BASE_INIT((LITERAL), 0), \ + ._base = _PyUnicode_ASCII_BASE_INIT((LITERAL), 0, 0), \ .utf8 = (UTF8), \ .utf8_length = sizeof(UTF8) - 1, \ }, \ diff --git a/Include/internal/pycore_runtime_init_generated.h b/Include/internal/pycore_runtime_init_generated.h index d41a7478db663f..1fe984112c81b8 100644 --- a/Include/internal/pycore_runtime_init_generated.h +++ b/Include/internal/pycore_runtime_init_generated.h @@ -1259,7 +1259,7 @@ extern "C" { } #define _Py_str_ascii_INIT { \ - _PyASCIIObject_INIT("\x00"), \ + _PyASCIIObject_INIT_embed_null("\x00"), \ _PyASCIIObject_INIT("\x01"), \ _PyASCIIObject_INIT("\x02"), \ _PyASCIIObject_INIT("\x03"), \ diff --git a/Misc/NEWS.d/next/C 
API/2023-11-01-03-18-21.gh-issue-111089.GxXlz0.rst b/Misc/NEWS.d/next/C API/2023-11-01-03-18-21.gh-issue-111089.GxXlz0.rst new file mode 100644 index 00000000000000..d797958c9bc67f --- /dev/null +++ b/Misc/NEWS.d/next/C API/2023-11-01-03-18-21.gh-issue-111089.GxXlz0.rst @@ -0,0 +1,5 @@ +Add ``PyASCIIObject.state.embed_null`` member to Python str objects. It is +used as a cache by :c:func:`PyUnicode_AsUTF8` to only check once if a string +contains a null character. Strings created by :c:func:`PyUnicode_FromString` +initializes *embed_null* since the string cannot contain a null character. +Patch by Victor Stinner. diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index a10183dddeca98..950b924694710f 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -301,7 +301,12 @@ unicode_fromstring(PyObject *self, PyObject *arg) if (!PyArg_Parse(arg, "z#", &s, &size)) { return NULL; } - return PyUnicode_FromString(s); + PyObject *unicode = PyUnicode_FromString(s); + if (unicode == NULL) { + return NULL; + } + assert(((PyASCIIObject*)unicode)->state.embed_null == 0); + return unicode; } /* Test PyUnicode_FromKindAndData() */ diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 87636efcfca050..da9b47b9c9b4f4 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -205,6 +205,10 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, static inline int unicode_is_finalizing(void); static int unicode_is_singleton(PyObject *unicode); #endif +static inline Py_ssize_t +findchar(const void *s, int kind, + Py_ssize_t size, Py_UCS4 ch, + int direction); // Return a reference to the immortal empty string singleton. 
@@ -623,6 +627,15 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content) } CHECK(PyUnicode_READ(kind, data, ascii->length) == 0); } + + if (_PyUnicode_STATE(ascii).embed_null != 2) { + Py_ssize_t pos = findchar(PyUnicode_DATA(ascii), + PyUnicode_KIND(ascii), + PyUnicode_GET_LENGTH(ascii), + 0, 1); + assert(_PyUnicode_STATE(ascii).embed_null == (pos >= 0)); + } + return 1; #undef CHECK @@ -1253,6 +1266,7 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) _PyUnicode_STATE(unicode).compact = 1; _PyUnicode_STATE(unicode).ascii = is_ascii; _PyUnicode_STATE(unicode).statically_allocated = 0; + _PyUnicode_STATE(unicode).embed_null = 2; if (is_ascii) { ((char*)data)[size] = 0; } @@ -1890,7 +1904,16 @@ PyUnicode_FromString(const char *u) PyErr_SetString(PyExc_OverflowError, "input too long"); return NULL; } - return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL); + PyObject *unicode; + unicode = PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL); + if (unicode != NULL) { + // PyUnicode_DecodeUTF8Stateful(u, strlen(u)) cannot create NUL + // characters: the UTF-8 decoder with the strict error handler only + // creates a NUL character if the input string contains a NUL byte + // which cannot be the case here. + _PyUnicode_STATE(unicode).embed_null = 0; + } + return unicode; } @@ -1932,6 +1955,7 @@ _PyUnicode_FromId(_Py_Identifier *id) if (!obj) { return NULL; } + _PyUnicode_STATE(obj).embed_null = 0; PyUnicode_InternInPlace(&obj); if (index >= ids->size) { @@ -3846,10 +3870,27 @@ PyUnicode_AsUTF8(PyObject *unicode) { Py_ssize_t size; const char *utf8 = PyUnicode_AsUTF8AndSize(unicode, &size); - if (utf8 != NULL && strlen(utf8) != (size_t)size) { - PyErr_SetString(PyExc_ValueError, "embedded null character"); + if (utf8 == NULL) { return NULL; } + + // Cache to avoid calling O(n) strlen() operation at every + // PyUnicode_AsUTF8() call on the same object. 
+ if (_PyUnicode_STATE(unicode).embed_null == 2) { + if (strlen(utf8) != (size_t)size) { + _PyUnicode_STATE(unicode).embed_null = 1; + } + else { + _PyUnicode_STATE(unicode).embed_null = 0; + } + } + + if (_PyUnicode_STATE(unicode).embed_null == 1) { + PyErr_SetString(PyExc_ValueError, + "embedded null character"); + return NULL; + } + return utf8; } diff --git a/Tools/build/generate_global_objects.py b/Tools/build/generate_global_objects.py index ded19ee489e79b..fbf33ef1a4ad13 100644 --- a/Tools/build/generate_global_objects.py +++ b/Tools/build/generate_global_objects.py @@ -232,6 +232,14 @@ def open_for_changes(filename, orig): def generate_global_strings(identifiers, strings): filename = os.path.join(INTERNAL, 'pycore_global_strings.h') + # NUL characters are not supported; see _PyASCIIObject_INIT_embed_null(). + for identifier in identifiers: + if "\0" in identifier: + raise Exception(f"an identifier contains a null character: {identifier!r}") + for string in strings: + if "\0" in string: + raise Exception(f"a string contains a null character: {string!r}") + # Read the non-generated part of the file. with open(filename) as infile: orig = infile.read() @@ -321,7 +329,10 @@ def generate_runtime_init(identifiers, strings): printer.write('') with printer.block('#define _Py_str_ascii_INIT', continuation=True): for i in range(128): - printer.write(f'_PyASCIIObject_INIT("\\x{i:02x}"),') + if i == 0: + printer.write(f'_PyASCIIObject_INIT_embed_null("\\x{i:02x}"),') + else: + printer.write(f'_PyASCIIObject_INIT("\\x{i:02x}"),') immortal_objects.append(f'(PyObject *)&_Py_SINGLETON(strings).ascii[{i}]') printer.write('') with printer.block('#define _Py_str_latin1_INIT', continuation=True): From a7e93c9ea9ba52738c9fb0f878bb1f8bb19cd496 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Wed, 1 Nov 2023 03:53:44 +0100 Subject: [PATCH 2/9] fixup! 
gh-111089: Add cache to PyUnicode_AsUTF8() for embedded NUL Fix unicode_subtype_new --- Objects/unicodeobject.c | 1 + 1 file changed, 1 insertion(+) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index da9b47b9c9b4f4..a8b02180e4e83a 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -14668,6 +14668,7 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode) _PyUnicode_STATE(self).compact = 0; _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii; _PyUnicode_STATE(self).statically_allocated = 0; + _PyUnicode_STATE(self).embed_null = 2; _PyUnicode_UTF8_LENGTH(self) = 0; _PyUnicode_UTF8(self) = NULL; _PyUnicode_DATA_ANY(self) = NULL; From 4ccd7d9c66403db8477c16ae9068cccfd81f5cff Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Wed, 1 Nov 2023 04:38:39 +0100 Subject: [PATCH 3/9] Fix _PyUnicode_CheckConsistency() in release mode --- Objects/unicodeobject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index a8b02180e4e83a..9be204be60c3ba 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -633,7 +633,7 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content) PyUnicode_KIND(ascii), PyUnicode_GET_LENGTH(ascii), 0, 1); - assert(_PyUnicode_STATE(ascii).embed_null == (pos >= 0)); + CHECK(_PyUnicode_STATE(ascii).embed_null == (pos >= 0)); } return 1; From 3c4844f895b1b3911cf1c48cdfa778ef6fe1e562 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Wed, 1 Nov 2023 22:35:58 +0100 Subject: [PATCH 4/9] Add constant --- Objects/unicodeobject.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 9be204be60c3ba..ccfe64dd416d3f 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -189,6 +189,8 @@ NOTE: In the interpreter's initialization phase, some globals are currently # define OVERALLOCATE_FACTOR 4 #endif +#define EMBED_NULL_UNKNOWN 2 + /* 
Forward declaration */ static inline int _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch); @@ -628,7 +630,7 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content) CHECK(PyUnicode_READ(kind, data, ascii->length) == 0); } - if (_PyUnicode_STATE(ascii).embed_null != 2) { + if (_PyUnicode_STATE(ascii).embed_null != EMBED_NULL_UNKNOWN) { Py_ssize_t pos = findchar(PyUnicode_DATA(ascii), PyUnicode_KIND(ascii), PyUnicode_GET_LENGTH(ascii), @@ -1266,7 +1268,7 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) _PyUnicode_STATE(unicode).compact = 1; _PyUnicode_STATE(unicode).ascii = is_ascii; _PyUnicode_STATE(unicode).statically_allocated = 0; - _PyUnicode_STATE(unicode).embed_null = 2; + _PyUnicode_STATE(unicode).embed_null = EMBED_NULL_UNKNOWN; if (is_ascii) { ((char*)data)[size] = 0; } @@ -3876,7 +3878,7 @@ PyUnicode_AsUTF8(PyObject *unicode) // Cache to avoid calling O(n) strlen() operation at every // PyUnicode_AsUTF8() call on the same object. - if (_PyUnicode_STATE(unicode).embed_null == 2) { + if (_PyUnicode_STATE(unicode).embed_null == EMBED_NULL_UNKNOWN) { if (strlen(utf8) != (size_t)size) { _PyUnicode_STATE(unicode).embed_null = 1; } @@ -14668,7 +14670,7 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode) _PyUnicode_STATE(self).compact = 0; _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii; _PyUnicode_STATE(self).statically_allocated = 0; - _PyUnicode_STATE(self).embed_null = 2; + _PyUnicode_STATE(self).embed_null = EMBED_NULL_UNKNOWN; _PyUnicode_UTF8_LENGTH(self) = 0; _PyUnicode_UTF8(self) = NULL; _PyUnicode_DATA_ANY(self) = NULL; From e2247511e9228c0810e153b31de4fee369b43e75 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Wed, 1 Nov 2023 23:16:32 +0100 Subject: [PATCH 5/9] unicode_resize() clears embed_null cache --- Objects/unicodeobject.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index ccfe64dd416d3f..13dc26b8dc9869 100644 --- 
a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1015,6 +1015,7 @@ resize_compact(PyObject *unicode, Py_ssize_t length) _PyUnicode_UTF8(unicode) = NULL; _PyUnicode_UTF8_LENGTH(unicode) = 0; } + _PyUnicode_STATE(unicode).embed_null = EMBED_NULL_UNKNOWN; #ifdef Py_TRACE_REFS _Py_ForgetReference(unicode); #endif @@ -1068,6 +1069,7 @@ resize_inplace(PyObject *unicode, Py_ssize_t length) _PyUnicode_UTF8(unicode) = NULL; _PyUnicode_UTF8_LENGTH(unicode) = 0; } + _PyUnicode_STATE(unicode).embed_null = EMBED_NULL_UNKNOWN; data = (PyObject *)PyObject_Realloc(data, new_size); if (data == NULL) { @@ -11082,6 +11084,7 @@ PyUnicode_Append(PyObject **p_left, PyObject *right) Py_DECREF(left); *p_left = res; } + assert(_PyUnicode_STATE(*p_left).embed_null == EMBED_NULL_UNKNOWN); assert(_PyUnicode_CheckConsistency(*p_left, 1)); return; From 65c667132ddff29e5d253e43412d840afbdc49c3 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Wed, 1 Nov 2023 23:34:18 +0100 Subject: [PATCH 6/9] Add What's New entry --- Doc/whatsnew/3.13.rst | 9 ++++++++- .../C API/2023-11-01-03-18-21.gh-issue-111089.GxXlz0.rst | 2 +- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index e5f39c58490b85..91ddcf313c8f3e 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -1120,6 +1120,13 @@ New Features * Add :c:func:`PyUnicode_AsUTF8` function to the limited C API. (Contributed by Victor Stinner in :gh:`111089`.) +* Add ``PyASCIIObject.state.embed_null`` member to Python :class:`str` objects. + It is used as a cache by :c:func:`PyUnicode_AsUTF8` to only check once if a + string contains a null character. Strings created by + :c:func:`PyUnicode_FromString` initialize *embed_null* to 0 since the string + cannot contain a null character. + (Contributed by Victor Stinner in :gh:`111089`.) 
+ Porting to Python 3.13 ---------------------- @@ -1192,7 +1199,7 @@ Porting to Python 3.13 * The :c:func:`PyUnicode_AsUTF8` function now raises an exception if the string contains embedded null characters. To accept embedded null characters and - truncate on purpose at the first null byte, + truncate on purpose at the first null character, ``PyUnicode_AsUTF8AndSize(unicode, NULL)`` can be used instead. (Contributed by Victor Stinner in :gh:`111089`.) diff --git a/Misc/NEWS.d/next/C API/2023-11-01-03-18-21.gh-issue-111089.GxXlz0.rst b/Misc/NEWS.d/next/C API/2023-11-01-03-18-21.gh-issue-111089.GxXlz0.rst index d797958c9bc67f..869c1639efa332 100644 --- a/Misc/NEWS.d/next/C API/2023-11-01-03-18-21.gh-issue-111089.GxXlz0.rst +++ b/Misc/NEWS.d/next/C API/2023-11-01-03-18-21.gh-issue-111089.GxXlz0.rst @@ -1,5 +1,5 @@ Add ``PyASCIIObject.state.embed_null`` member to Python str objects. It is used as a cache by :c:func:`PyUnicode_AsUTF8` to only check once if a string contains a null character. Strings created by :c:func:`PyUnicode_FromString` -initializes *embed_null* since the string cannot contain a null character. +initializes *embed_null* to 0 since the string cannot contain a null character. Patch by Victor Stinner. From 30bb7254a334ef0c276db4d00432046cefed9514 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Wed, 1 Nov 2023 23:38:58 +0100 Subject: [PATCH 7/9] Make the fast path faster. Suggestion by Serhiy --- Objects/unicodeobject.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 13dc26b8dc9869..58f2597d3816fe 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -3880,19 +3880,21 @@ PyUnicode_AsUTF8(PyObject *unicode) // Cache to avoid calling O(n) strlen() operation at every // PyUnicode_AsUTF8() call on the same object. 
- if (_PyUnicode_STATE(unicode).embed_null == EMBED_NULL_UNKNOWN) { - if (strlen(utf8) != (size_t)size) { - _PyUnicode_STATE(unicode).embed_null = 1; - } - else { - _PyUnicode_STATE(unicode).embed_null = 0; + if (_PyUnicode_STATE(unicode).embed_null != 0) { + if (_PyUnicode_STATE(unicode).embed_null == EMBED_NULL_UNKNOWN) { + if (strlen(utf8) != (size_t)size) { + _PyUnicode_STATE(unicode).embed_null = 1; + } + else { + _PyUnicode_STATE(unicode).embed_null = 0; + } } - } - if (_PyUnicode_STATE(unicode).embed_null == 1) { - PyErr_SetString(PyExc_ValueError, - "embedded null character"); - return NULL; + if (_PyUnicode_STATE(unicode).embed_null == 1) { + PyErr_SetString(PyExc_ValueError, + "embedded null character"); + return NULL; + } } return utf8; From 07975be323e9dec1de4b584830617c2f382b3c8d Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 2 Nov 2023 00:23:57 +0100 Subject: [PATCH 8/9] Revert test unicode_fromstring() change --- Modules/_testcapi/unicode.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 950b924694710f..a10183dddeca98 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -301,12 +301,7 @@ unicode_fromstring(PyObject *self, PyObject *arg) if (!PyArg_Parse(arg, "z#", &s, &size)) { return NULL; } - PyObject *unicode = PyUnicode_FromString(s); - if (unicode == NULL) { - return NULL; - } - assert(((PyASCIIObject*)unicode)->state.embed_null == 0); - return unicode; + return PyUnicode_FromString(s); } /* Test PyUnicode_FromKindAndData() */ From e3c6fa51d8c013e40f707b9d4bb9da9986f983dc Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 2 Nov 2023 22:12:47 +0100 Subject: [PATCH 9/9] Set embed_null in more cases on new strings * unicode_char() * PyUnicode_FromWideChar(str, -1) * _PyUnicode_Copy() --- Objects/unicodeobject.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c 
index 58f2597d3816fe..369d4660e8339a 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1795,6 +1795,8 @@ unicode_char(Py_UCS4 ch) assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); PyUnicode_4BYTE_DATA(unicode)[0] = ch; } + // ch >= 256 and so cannot be 0 + _PyUnicode_STATE(unicode).embed_null = 0; assert(_PyUnicode_CheckConsistency(unicode, 1)); return unicode; } @@ -1811,8 +1813,13 @@ PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size) return NULL; } + unsigned int embed_null; if (size == -1) { size = wcslen(u); + embed_null = 0; + } + else { + embed_null = EMBED_NULL_UNKNOWN; } /* If the Unicode data is known at construction time, we can apply @@ -1877,6 +1884,7 @@ PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size) default: Py_UNREACHABLE(); } + _PyUnicode_STATE(unicode).embed_null = embed_null; return unicode_result(unicode); } @@ -2232,6 +2240,7 @@ _PyUnicode_Copy(PyObject *unicode) memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode), length * PyUnicode_KIND(unicode)); + _PyUnicode_STATE(copy).embed_null = _PyUnicode_STATE(unicode).embed_null; assert(_PyUnicode_CheckConsistency(copy, 1)); return copy; }