From 17a165a29084b02b929e35c187c7f722aa03ae68 Mon Sep 17 00:00:00 2001 From: matthiashuschle Date: Sat, 14 Oct 2017 16:36:38 +0200 Subject: [PATCH] BUG: to_json - prevent various segfault conditions (GH14256) (#17857) --- doc/source/whatsnew/v0.21.0.txt | 4 +- pandas/_libs/src/ujson/lib/ultrajson.h | 7 ++++ pandas/_libs/src/ujson/lib/ultrajsonenc.c | 7 +--- pandas/_libs/src/ujson/python/objToJSON.c | 7 +++- pandas/tests/io/json/test_pandas.py | 45 +++++++++++++++++++++++ 5 files changed, 61 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 10e551df7326db..828692195f29e9 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -956,6 +956,7 @@ I/O - Bug in :meth:`DataFrame.to_html` with ``notebook=True`` where DataFrames with named indices or non-MultiIndex indices had undesired horizontal or vertical alignment for column or row labels, respectively (:issue:`16792`) - Bug in :meth:`DataFrame.to_html` in which there was no validation of the ``justify`` parameter (:issue:`17527`) - Bug in :func:`HDFStore.select` when reading a contiguous mixed-data table featuring VLArray (:issue:`17021`) +- Bug in :func:`to_json` where several conditions (including objects with unprintable symbols, objects with deep recursion, overlong labels) caused segfaults instead of raising the appropriate exception (:issue:`14256`) Plotting ^^^^^^^^ @@ -1033,11 +1034,8 @@ Other ^^^^^ - Bug where some inplace operators were not being wrapped and produced a copy when invoked (:issue:`12962`) - Bug in :func:`eval` where the ``inplace`` parameter was being incorrectly handled (:issue:`16732`) -<<<<<<< HEAD - The ``Series`` constructor with no arguments would have an index like ``Index([], dtype='object')`` instead of ``RangeIndex(start=0, stop=0, step=1)`` - Bug in ``.isin()`` in which checking membership in empty ``Series`` objects raised an error (:issue:`16991`) - Bug in :func:`unique` where checking a tuple of strings raised a ``TypeError`` (:issue:`17108`) - Several ``NaT`` method docstrings (e.g. :func:`NaT.ctime`) were incorrect (:issue:`17327`) - The documentation has had references to versions < v0.17 removed and cleaned up (:issue:`17442`, :issue:`17442`, :issue:`17404` & :issue:`17504`) -======= ->>>>>>> DOC: whatsnew fixes (#17626) diff --git a/pandas/_libs/src/ujson/lib/ultrajson.h b/pandas/_libs/src/ujson/lib/ultrajson.h index 4f51fa8b3eb383..159645b4007e1c 100644 --- a/pandas/_libs/src/ujson/lib/ultrajson.h +++ b/pandas/_libs/src/ujson/lib/ultrajson.h @@ -307,4 +307,11 @@ EXPORTFUNCTION JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, size_t cbBuffer); EXPORTFUNCTION void encode(JSOBJ, JSONObjectEncoder *, const char *, size_t); +#define Buffer_Reserve(__enc, __len) \ + if ((size_t)((__enc)->end - (__enc)->offset) < (size_t)(__len)) { \ + Buffer_Realloc((__enc), (__len)); \ + } + +void Buffer_Realloc(JSONObjectEncoder *enc, size_t cbNeeded); + #endif // PANDAS__LIBS_SRC_UJSON_LIB_ULTRAJSON_H_ diff --git a/pandas/_libs/src/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/ujson/lib/ultrajsonenc.c index 6bf22977490063..2d6c823a45515e 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsonenc.c +++ b/pandas/_libs/src/ujson/lib/ultrajsonenc.c @@ -714,11 +714,6 @@ int Buffer_EscapeStringValidated(JSOBJ obj, JSONObjectEncoder *enc, } } -#define Buffer_Reserve(__enc, __len) \ - if ((size_t)((__enc)->end - (__enc)->offset) < (size_t)(__len)) { \ - Buffer_Realloc((__enc), (__len)); \ - } - #define Buffer_AppendCharUnchecked(__enc, __chr) *((__enc)->offset++) = __chr; FASTCALL_ATTR INLINE_PREFIX void FASTCALL_MSVC strreverse(char *begin, @@ -976,6 +971,7 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, } enc->iterEnd(obj, &tc); + Buffer_Reserve(enc, 2); Buffer_AppendCharUnchecked(enc, ']'); break; } @@ -1003,6 +999,7 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, } enc->iterEnd(obj, &tc); + Buffer_Reserve(enc, 2); Buffer_AppendCharUnchecked(enc, '}'); break; } diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 1ee862b54cf0bc..ae7854dfc14278 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -783,6 +783,7 @@ static void NpyArr_getLabel(JSOBJ obj, JSONTypeContext *tc, size_t *outLen, JSONObjectEncoder *enc = (JSONObjectEncoder *)tc->encoder; PRINTMARK(); *outLen = strlen(labels[idx]); + Buffer_Reserve(enc, *outLen); memcpy(enc->offset, labels[idx], sizeof(char) * (*outLen)); enc->offset += *outLen; *outLen = 0; @@ -879,7 +880,7 @@ int PdBlock_iterNext(JSOBJ obj, JSONTypeContext *tc) { NpyArrContext *npyarr; PRINTMARK(); - if (PyErr_Occurred()) { + if (PyErr_Occurred() || ((JSONObjectEncoder *)tc->encoder)->errorMsg) { return 0; } @@ -1224,6 +1225,10 @@ int Dir_iterNext(JSOBJ _obj, JSONTypeContext *tc) { PyObject *attrName; char *attrStr; + if (PyErr_Occurred() || ((JSONObjectEncoder *)tc->encoder)->errorMsg) { + return 0; + } + if (itemValue) { Py_DECREF(GET_TC(tc)->itemValue); GET_TC(tc)->itemValue = itemValue = NULL; diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index de4afec883efdb..6625446bea4693 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -511,6 +511,51 @@ def test_blocks_compat_GH9037(self): by_blocks=True, check_exact=True) + def test_frame_nonprintable_bytes(self): + # GH14256: failing column caused segfaults, if it is not the last one + + class BinaryThing(object): + + def __init__(self, hexed): + self.hexed = hexed + if compat.PY2: + self.binary = hexed.decode('hex') + else: + self.binary = bytes.fromhex(hexed) + + def __str__(self): + return self.hexed + + hexed = '574b4454ba8c5eb4f98a8f45' + binthing = BinaryThing(hexed) + + # verify the proper conversion of printable content + df_printable = DataFrame({'A': [binthing.hexed]}) + assert df_printable.to_json() == '{"A":{"0":"%s"}}' % hexed + + # check if non-printable content throws appropriate Exception + df_nonprintable = DataFrame({'A': [binthing]}) + with pytest.raises(OverflowError): + df_nonprintable.to_json() + + # the same with multiple columns threw segfaults + df_mixed = DataFrame({'A': [binthing], 'B': [1]}, + columns=['A', 'B']) + with pytest.raises(OverflowError): + df_mixed.to_json() + + # default_handler should resolve exceptions for non-string types + assert df_nonprintable.to_json(default_handler=str) == \ + '{"A":{"0":"%s"}}' % hexed + assert df_mixed.to_json(default_handler=str) == \ + '{"A":{"0":"%s"},"B":{"0":1}}' % hexed + + def test_label_overflow(self): + # GH14256: buffer length not checked when writing label + df = pd.DataFrame({'foo': [1337], 'bar' * 100000: [1]}) + assert df.to_json() == \ + '{"%s":{"0":1},"foo":{"0":1337}}' % ('bar' * 100000) + def test_series_non_unique_index(self): s = Series(['a', 'b'], index=[1, 1])