Skip to content

Commit

Permalink
bpo-35883: Py_DecodeLocale() escapes invalid Unicode characters (GH-2…
Browse files Browse the repository at this point in the history
…4843) (GH-24906)

Python no longer fails at startup with a fatal error if a command
line argument contains an invalid Unicode character.

The Py_DecodeLocale() function now escapes byte sequences which would
be decoded as Unicode characters outside the [U+0000; U+10ffff]
range.

Use MAX_UNICODE constant in unicodeobject.c.
(cherry picked from commit 9976834)

Co-authored-by: Victor Stinner <vstinner@python.org>

Co-authored-by: Victor Stinner <vstinner@python.org>
  • Loading branch information
miss-islington and vstinner authored Mar 29, 2021
1 parent e9092b2 commit 3b6e61e
Show file tree
Hide file tree
Showing 4 changed files with 148 additions and 69 deletions.
86 changes: 60 additions & 26 deletions Lib/test/test_cmd_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,38 +190,72 @@ def test_undecodable_code(self):
if not stdout.startswith(pattern):
raise AssertionError("%a doesn't start with %a" % (stdout, pattern))

@unittest.skipIf(sys.platform == 'win32',
'Windows has a native unicode API')
def test_invalid_utf8_arg(self):
# bpo-35883: Py_DecodeLocale() must escape b'\xfd\xbf\xbf\xbb\xba\xba'
# byte sequence with surrogateescape rather than decoding it as the
# U+7fffbeba character which is outside the [U+0000; U+10ffff] range of
# Python Unicode characters.
#
# Test with default config, in the C locale, in the Python UTF-8 Mode.
code = 'import sys, os; s=os.fsencode(sys.argv[1]); print(ascii(s))'
base_cmd = [sys.executable, '-c', code]

def run_default(arg):
cmd = [sys.executable, '-c', code, arg]
return subprocess.run(cmd, stdout=subprocess.PIPE, text=True)

def run_c_locale(arg):
cmd = [sys.executable, '-c', code, arg]
env = dict(os.environ)
env['LC_ALL'] = 'C'
return subprocess.run(cmd, stdout=subprocess.PIPE,
text=True, env=env)

def run_utf8_mode(arg):
cmd = [sys.executable, '-X', 'utf8', '-c', code, arg]
return subprocess.run(cmd, stdout=subprocess.PIPE, text=True)

valid_utf8 = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8')
# invalid UTF-8 byte sequences with a valid UTF-8 sequence
# in the middle.
invalid_utf8 = (
b'\xff' # invalid byte
b'\xc3\xff' # invalid byte sequence
b'\xc3\xa9' # valid utf-8: U+00E9 character
b'\xed\xa0\x80' # lone surrogate character (invalid)
b'\xfd\xbf\xbf\xbb\xba\xba' # character outside [U+0000; U+10ffff]
)
test_args = [valid_utf8, invalid_utf8]

for run_cmd in (run_default, run_c_locale, run_utf8_mode):
with self.subTest(run_cmd=run_cmd):
for arg in test_args:
proc = run_cmd(arg)
self.assertEqual(proc.stdout.rstrip(), ascii(arg))

@unittest.skipUnless((sys.platform == 'darwin' or
support.is_android), 'test specific to Mac OS X and Android')
def test_osx_android_utf8(self):
def check_output(text):
decoded = text.decode('utf-8', 'surrogateescape')
expected = ascii(decoded).encode('ascii') + b'\n'
text = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8')
code = "import sys; print(ascii(sys.argv[1]))"

env = os.environ.copy()
# C locale gives ASCII locale encoding, but Python uses UTF-8
# to parse the command line arguments on Mac OS X and Android.
env['LC_ALL'] = 'C'
decoded = text.decode('utf-8', 'surrogateescape')
expected = ascii(decoded).encode('ascii') + b'\n'

p = subprocess.Popen(
(sys.executable, "-c", "import sys; print(ascii(sys.argv[1]))", text),
stdout=subprocess.PIPE,
env=env)
stdout, stderr = p.communicate()
self.assertEqual(stdout, expected)
self.assertEqual(p.returncode, 0)
env = os.environ.copy()
# C locale gives ASCII locale encoding, but Python uses UTF-8
# to parse the command line arguments on Mac OS X and Android.
env['LC_ALL'] = 'C'

# test valid utf-8
text = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8')
check_output(text)

# test invalid utf-8
text = (
b'\xff' # invalid byte
b'\xc3\xa9' # valid utf-8 character
b'\xc3\xff' # invalid byte sequence
b'\xed\xa0\x80' # lone surrogate character (invalid)
)
check_output(text)
p = subprocess.Popen(
(sys.executable, "-c", code, text),
stdout=subprocess.PIPE,
env=env)
stdout, stderr = p.communicate()
self.assertEqual(stdout, expected)
self.assertEqual(p.returncode, 0)

def test_unbuffered_output(self):
# Test expected operation of the '-u' switch
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Python no longer fails at startup with a fatal error if a command line
argument contains an invalid Unicode character. The
:c:func:`Py_DecodeLocale` function now escapes byte sequences which would be
decoded as Unicode characters outside the [U+0000; U+10ffff] range.
9 changes: 5 additions & 4 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,8 @@ NOTE: In the interpreter's initialization phase, some globals are currently
extern "C" {
#endif

/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
// Maximum code point of Unicode 6.0: 0x10ffff (1,114,111).
// The value must be the same in fileutils.c.
#define MAX_UNICODE 0x10ffff

#ifdef Py_DEBUG
Expand Down Expand Up @@ -1707,8 +1708,8 @@ find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
*maxchar = ch;
if (*maxchar > MAX_UNICODE) {
PyErr_Format(PyExc_ValueError,
"character U+%x is not in range [U+0000; U+10ffff]",
ch);
"character U+%x is not in range [U+0000; U+%x]",
ch, MAX_UNICODE);
return -1;
}
}
Expand Down Expand Up @@ -13610,7 +13611,7 @@ _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
{
case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
default:
Py_UNREACHABLE();
}
Expand Down
118 changes: 79 additions & 39 deletions Python/fileutils.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,13 @@ extern int winerror_to_errno(int);
int _Py_open_cloexec_works = -1;
#endif

// Highest code point accepted for a decoded wide character; larger values
// are rejected by is_valid_wide_char().
// The value must be the same in unicodeobject.c.
#define MAX_UNICODE 0x10ffff

// Error values returned by mbstowcs() and mbrtowc():
// - DECODE_ERROR: the input could not be decoded. The same (size_t)-1 value
//   is also compared against wcstombs() results in this file.
// - INCOMPLETE_CHARACTER: only checked on mbrtowc() results here, where it
//   is handled as an incomplete character.
static const size_t DECODE_ERROR = ((size_t)-1);
static const size_t INCOMPLETE_CHARACTER = (size_t)-2;


static int
get_surrogateescape(_Py_error_handler errors, int *surrogateescape)
Expand Down Expand Up @@ -85,6 +92,57 @@ _Py_device_encoding(int fd)
Py_RETURN_NONE;
}


// Check whether a wide character produced by the C library decoder can be
// used as a Python Unicode character.
//
// Return 1 if ch is acceptable, 0 if it is a surrogate code point or if it
// is greater than MAX_UNICODE. The result is a boolean flag, so the return
// type is int rather than size_t.
static int
is_valid_wide_char(wchar_t ch)
{
    if (Py_UNICODE_IS_SURROGATE(ch)) {
        // Reject lone surrogate characters
        return 0;
    }
    if (ch > MAX_UNICODE) {
        // bpo-35883: Reject characters outside [U+0000; U+10ffff] range.
        // The glibc mbstowcs() UTF-8 decoder does not respect RFC 3629,
        // it creates characters outside the [U+0000; U+10ffff] range:
        // https://sourceware.org/bugzilla/show_bug.cgi?id=2373
        return 0;
    }
    return 1;
}


// mbstowcs() wrapper which additionally validates the decoded characters.
//
// Return the mbstowcs() result, or DECODE_ERROR if any of the characters
// written to dest is rejected by is_valid_wide_char(). When dest is NULL
// nothing has been written, so the mbstowcs() result is returned unchanged.
static size_t
_Py_mbstowcs(wchar_t *dest, const char *src, size_t n)
{
    size_t count = mbstowcs(dest, src, n);
    if (dest == NULL || count == DECODE_ERROR) {
        // Nothing to validate
        return count;
    }

    const wchar_t *end = dest + count;
    for (const wchar_t *p = dest; p < end; p++) {
        if (!is_valid_wide_char(*p)) {
            return DECODE_ERROR;
        }
    }
    return count;
}


#ifdef HAVE_MBRTOWC
// mbrtowc() wrapper which additionally validates the decoded character.
//
// pwc must not be NULL. The mbrtowc() result is returned as is when it is 0,
// DECODE_ERROR or INCOMPLETE_CHARACTER. Otherwise, the character stored in
// *pwc is checked with is_valid_wide_char() and DECODE_ERROR is returned if
// it is rejected.
static size_t
_Py_mbrtowc(wchar_t *pwc, const char *str, size_t len, mbstate_t *pmbs)
{
    assert(pwc != NULL);
    size_t count = mbrtowc(pwc, str, len, pmbs);
    if (count == 0 || count == DECODE_ERROR || count == INCOMPLETE_CHARACTER) {
        // No character to validate
        return count;
    }
    return is_valid_wide_char(*pwc) ? count : DECODE_ERROR;
}
#endif


#if !defined(_Py_FORCE_UTF8_FS_ENCODING) && !defined(MS_WINDOWS)

#define USE_FORCE_ASCII
Expand Down Expand Up @@ -151,8 +209,8 @@ check_force_ascii(void)
size_t res;

ch = (unsigned char)0xA7;
res = mbstowcs(&wch, (char*)&ch, 1);
if (res != (size_t)-1 && wch == L'\xA7') {
res = _Py_mbstowcs(&wch, (char*)&ch, 1);
if (res != DECODE_ERROR && wch == L'\xA7') {
/* On HP-UX with the C locale or the POSIX locale,
nl_langinfo(CODESET) announces "roman8", whereas mbstowcs() uses
Latin1 encoding in practice. Force ASCII in this case.
Expand Down Expand Up @@ -199,8 +257,8 @@ check_force_ascii(void)

unsigned uch = (unsigned char)i;
ch[0] = (char)uch;
res = mbstowcs(wch, ch, 1);
if (res != (size_t)-1) {
res = _Py_mbstowcs(wch, ch, 1);
if (res != DECODE_ERROR) {
/* decoding a non-ASCII character from the locale encoding succeeded:
the locale encoding is not ASCII, force ASCII */
return 1;
Expand Down Expand Up @@ -390,9 +448,9 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
*/
argsize = strlen(arg);
#else
argsize = mbstowcs(NULL, arg, 0);
argsize = _Py_mbstowcs(NULL, arg, 0);
#endif
if (argsize != (size_t)-1) {
if (argsize != DECODE_ERROR) {
if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
return -1;
}
Expand All @@ -401,21 +459,13 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
return -1;
}

count = mbstowcs(res, arg, argsize + 1);
if (count != (size_t)-1) {
wchar_t *tmp;
/* Only use the result if it contains no
surrogate characters. */
for (tmp = res; *tmp != 0 &&
!Py_UNICODE_IS_SURROGATE(*tmp); tmp++)
;
if (*tmp == 0) {
if (wlen != NULL) {
*wlen = count;
}
*wstr = res;
return 0;
count = _Py_mbstowcs(res, arg, argsize + 1);
if (count != DECODE_ERROR) {
*wstr = res;
if (wlen != NULL) {
*wlen = count;
}
return 0;
}
PyMem_RawFree(res);
}
Expand All @@ -439,46 +489,36 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
out = res;
memset(&mbs, 0, sizeof mbs);
while (argsize) {
size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
size_t converted = _Py_mbrtowc(out, (char*)in, argsize, &mbs);
if (converted == 0) {
/* Reached end of string; null char stored. */
break;
}

if (converted == (size_t)-2) {
if (converted == INCOMPLETE_CHARACTER) {
/* Incomplete character. This should never happen,
since we provide everything that we have -
unless there is a bug in the C library, or I
misunderstood how mbrtowc works. */
goto decode_error;
}

if (converted == (size_t)-1) {
if (converted == DECODE_ERROR) {
if (!surrogateescape) {
goto decode_error;
}

/* Conversion error. Escape as UTF-8b, and start over
in the initial shift state. */
/* Decoding error. Escape as UTF-8b, and start over in the initial
shift state. */
*out++ = 0xdc00 + *in++;
argsize--;
memset(&mbs, 0, sizeof mbs);
continue;
}

if (Py_UNICODE_IS_SURROGATE(*out)) {
if (!surrogateescape) {
goto decode_error;
}
// _Py_mbrtowc() rejects lone surrogate characters
assert(!Py_UNICODE_IS_SURROGATE(*out));

/* Surrogate character. Escape the original
byte sequence with surrogateescape. */
argsize -= converted;
while (converted--) {
*out++ = 0xdc00 + *in++;
}
continue;
}
/* successfully converted some bytes */
in += converted;
argsize -= converted;
Expand Down Expand Up @@ -655,7 +695,7 @@ encode_current_locale(const wchar_t *text, char **str,
else {
converted = wcstombs(NULL, buf, 0);
}
if (converted == (size_t)-1) {
if (converted == DECODE_ERROR) {
goto encode_error;
}
if (bytes != NULL) {
Expand Down Expand Up @@ -1371,7 +1411,7 @@ _Py_wfopen(const wchar_t *path, const wchar_t *mode)
char cmode[10];
size_t r;
r = wcstombs(cmode, mode, 10);
if (r == (size_t)-1 || r >= 10) {
if (r == DECODE_ERROR || r >= 10) {
errno = EINVAL;
return NULL;
}
Expand Down

0 comments on commit 3b6e61e

Please sign in to comment.