From 5d1aa087a9de3939cfd225fd14a5490ee3732eb4 Mon Sep 17 00:00:00 2001 From: matthiashuschle Date: Sat, 14 Oct 2017 16:36:38 +0200 Subject: [PATCH] BUG: to_json - prevent various segfault conditions (GH14256) (#17857) --- doc/source/whatsnew/v0.21.0.txt | 2 + pandas/_libs/src/ujson/lib/ultrajson.h | 7 ++++ pandas/_libs/src/ujson/lib/ultrajsonenc.c | 7 +--- pandas/_libs/src/ujson/python/objToJSON.c | 7 +++- pandas/tests/io/json/test_pandas.py | 45 +++++++++++++++++++++++ 5 files changed, 62 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index d6bdf153e03684..f4fbbd3596b57c 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -956,6 +956,7 @@ I/O - Bug in :meth:`DataFrame.to_html` with ``notebook=True`` where DataFrames with named indices or non-MultiIndex indices had undesired horizontal or vertical alignment for column or row labels, respectively (:issue:`16792`) - Bug in :meth:`DataFrame.to_html` in which there was no validation of the ``justify`` parameter (:issue:`17527`) - Bug in :func:`HDFStore.select` when reading a contiguous mixed-data table featuring VLArray (:issue:`17021`) +- Bug in :func:`to_json` where several conditions (including objects with unprintable symbols, objects with deep recursion, overlong labels) caused segfaults instead of raising the appropriate exception (:issue:`14256`) Plotting ^^^^^^^^ @@ -1033,3 +1034,4 @@ Other ^^^^^ - Bug where some inplace operators were not being wrapped and produced a copy when invoked (:issue:`12962`) - Bug in :func:`eval` where the ``inplace`` parameter was being incorrectly handled (:issue:`16732`) + diff --git a/pandas/_libs/src/ujson/lib/ultrajson.h b/pandas/_libs/src/ujson/lib/ultrajson.h index 4f51fa8b3eb383..159645b4007e1c 100644 --- a/pandas/_libs/src/ujson/lib/ultrajson.h +++ b/pandas/_libs/src/ujson/lib/ultrajson.h @@ -307,4 +307,11 @@ EXPORTFUNCTION JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, size_t cbBuffer); EXPORTFUNCTION void encode(JSOBJ, JSONObjectEncoder *, const char *, size_t); +#define Buffer_Reserve(__enc, __len) \ + if ((size_t)((__enc)->end - (__enc)->offset) < (size_t)(__len)) { \ + Buffer_Realloc((__enc), (__len)); \ + } + +void Buffer_Realloc(JSONObjectEncoder *enc, size_t cbNeeded); + #endif // PANDAS__LIBS_SRC_UJSON_LIB_ULTRAJSON_H_ diff --git a/pandas/_libs/src/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/ujson/lib/ultrajsonenc.c index 6bf22977490063..2d6c823a45515e 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsonenc.c +++ b/pandas/_libs/src/ujson/lib/ultrajsonenc.c @@ -714,11 +714,6 @@ int Buffer_EscapeStringValidated(JSOBJ obj, JSONObjectEncoder *enc, } } -#define Buffer_Reserve(__enc, __len) \ - if ((size_t)((__enc)->end - (__enc)->offset) < (size_t)(__len)) { \ - Buffer_Realloc((__enc), (__len)); \ - } - #define Buffer_AppendCharUnchecked(__enc, __chr) *((__enc)->offset++) = __chr; FASTCALL_ATTR INLINE_PREFIX void FASTCALL_MSVC strreverse(char *begin, @@ -976,6 +971,7 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, } enc->iterEnd(obj, &tc); + Buffer_Reserve(enc, 2); Buffer_AppendCharUnchecked(enc, ']'); break; } @@ -1003,6 +999,7 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, } enc->iterEnd(obj, &tc); + Buffer_Reserve(enc, 2); Buffer_AppendCharUnchecked(enc, '}'); break; } diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 1ee862b54cf0bc..ae7854dfc14278 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -783,6 +783,7 @@ static void NpyArr_getLabel(JSOBJ obj, JSONTypeContext *tc, size_t *outLen, JSONObjectEncoder *enc = (JSONObjectEncoder *)tc->encoder; PRINTMARK(); *outLen = strlen(labels[idx]); + Buffer_Reserve(enc, *outLen); memcpy(enc->offset, labels[idx], sizeof(char) * (*outLen)); enc->offset += *outLen; *outLen = 0; @@ -879,7 +880,7 @@ int PdBlock_iterNext(JSOBJ obj, JSONTypeContext *tc) { NpyArrContext *npyarr; PRINTMARK(); - if (PyErr_Occurred()) { + if (PyErr_Occurred() || ((JSONObjectEncoder *)tc->encoder)->errorMsg) { return 0; } @@ -1224,6 +1225,10 @@ int Dir_iterNext(JSOBJ _obj, JSONTypeContext *tc) { PyObject *attrName; char *attrStr; + if (PyErr_Occurred() || ((JSONObjectEncoder *)tc->encoder)->errorMsg) { + return 0; + } + if (itemValue) { Py_DECREF(GET_TC(tc)->itemValue); GET_TC(tc)->itemValue = itemValue = NULL; diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index de4afec883efdb..6625446bea4693 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -511,6 +511,51 @@ def test_blocks_compat_GH9037(self): by_blocks=True, check_exact=True) + def test_frame_nonprintable_bytes(self): + # GH14256: failing column caused segfaults, if it is not the last one + + class BinaryThing(object): + + def __init__(self, hexed): + self.hexed = hexed + if compat.PY2: + self.binary = hexed.decode('hex') + else: + self.binary = bytes.fromhex(hexed) + + def __str__(self): + return self.hexed + + hexed = '574b4454ba8c5eb4f98a8f45' + binthing = BinaryThing(hexed) + + # verify the proper conversion of printable content + df_printable = DataFrame({'A': [binthing.hexed]}) + assert df_printable.to_json() == '{"A":{"0":"%s"}}' % hexed + + # check if non-printable content throws appropriate Exception + df_nonprintable = DataFrame({'A': [binthing]}) + with pytest.raises(OverflowError): + df_nonprintable.to_json() + + # the same with multiple columns threw segfaults + df_mixed = DataFrame({'A': [binthing], 'B': [1]}, + columns=['A', 'B']) + with pytest.raises(OverflowError): + df_mixed.to_json() + + # default_handler should resolve exceptions for non-string types + assert df_nonprintable.to_json(default_handler=str) == \ + '{"A":{"0":"%s"}}' % hexed + assert df_mixed.to_json(default_handler=str) == \ + '{"A":{"0":"%s"},"B":{"0":1}}' % hexed + + def test_label_overflow(self): + # GH14256: buffer length not checked when writing label + df = pd.DataFrame({'foo': [1337], 'bar' * 100000: [1]}) + assert df.to_json() == \ + '{"%s":{"0":1},"foo":{"0":1337}}' % ('bar' * 100000) + def test_series_non_unique_index(self): s = Series(['a', 'b'], index=[1, 1])