From 817b7069bd9fc014232c066dc79dafbf5463137e Mon Sep 17 00:00:00 2001 From: Tolker-KU <55140581+Tolker-KU@users.noreply.github.com> Date: Tue, 14 Jan 2025 19:00:43 +0100 Subject: [PATCH] ENH: Format `decimal.Decimal` as full precision strings in `.to_json(...)` (#60698) * Format decimal.Decimal as full precision strings in .to_json(...) * Fix failing tests * Clean up Decimal to utf8 convertion and switch to using PyObject_Format() to suppress scientific notation * Add whatsnew entry --- doc/source/whatsnew/v3.0.0.rst | 1 + .../src/vendored/ujson/python/objToJSON.c | 35 +++++++++++++++++-- .../json/test_json_table_schema_ext_dtype.py | 4 +-- pandas/tests/io/json/test_pandas.py | 7 +--- pandas/tests/io/json/test_ujson.py | 30 ++++++++-------- 5 files changed, 52 insertions(+), 25 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 1ca0bb9c653a4..bf1b52d3a0957 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -53,6 +53,7 @@ Other enhancements - :meth:`DataFrame.ewm` now allows ``adjust=False`` when ``times`` is provided (:issue:`54328`) - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`) - :meth:`DataFrame.pivot_table` and :func:`pivot_table` now allow the passing of keyword arguments to ``aggfunc`` through ``**kwargs`` (:issue:`57884`) +- :meth:`DataFrame.to_json` now encodes ``Decimal`` as strings instead of floats (:issue:`60698`) - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`) - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`) diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index 6b957148f94da..4adc32ba0fed9 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -373,6 +373,27 @@ static char *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, size_t *outLen) { return outValue; } +static char *PyDecimalToUTF8Callback(JSOBJ _obj, JSONTypeContext *tc, + size_t *len) { + PyObject *obj = (PyObject *)_obj; + PyObject *format_spec = PyUnicode_FromStringAndSize("f", 1); + PyObject *str = PyObject_Format(obj, format_spec); + Py_DECREF(format_spec); + + if (str == NULL) { + ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; + return NULL; + } + + GET_TC(tc)->newObj = str; + + Py_ssize_t s_len; + char *outValue = (char *)PyUnicode_AsUTF8AndSize(str, &s_len); + *len = s_len; + + return outValue; +} + //============================================================================= // Numpy array iteration functions //============================================================================= @@ -1467,8 +1488,18 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { tc->type = JT_UTF8; return; } else if (object_is_decimal_type(obj)) { - pc->doubleValue = PyFloat_AsDouble(obj); - tc->type = JT_DOUBLE; + PyObject *is_nan_py = PyObject_RichCompare(obj, obj, Py_NE); + if (is_nan_py == NULL) { + goto INVALID; + } + int is_nan = (is_nan_py == Py_True); + Py_DECREF(is_nan_py); + if (is_nan) { + tc->type = JT_NULL; + return; + } + pc->PyTypeToUTF8 = PyDecimalToUTF8Callback; + tc->type = JT_UTF8; return; } else if (PyDateTime_Check(obj) || PyDate_Check(obj)) { if (object_is_nat_type(obj)) { diff --git a/pandas/tests/io/json/test_json_table_schema_ext_dtype.py b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py index 8de289afe9ff9..12ae24b064c9d 100644 --- a/pandas/tests/io/json/test_json_table_schema_ext_dtype.py +++ b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py @@ -159,7 +159,7 @@ def test_build_decimal_series(self, dc): expected = OrderedDict( [ ("schema", schema), - ("data", [OrderedDict([("id", 0), ("a", 10.0)])]), + ("data", [OrderedDict([("id", 0), ("a", "10")])]), ] ) @@ -245,7 +245,7 @@ def test_to_json(self, da, dc, sa, ia): [ ("idx", 0), ("A", "2021-10-10T00:00:00.000"), - ("B", 10.0), + ("B", "10"), ("C", "pandas"), ("D", 10), ] diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index ad9dbf7554a8b..59997d52179e6 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1,6 +1,5 @@ import datetime from datetime import timedelta -from decimal import Decimal from io import StringIO import json import os @@ -2025,12 +2024,8 @@ def test_to_s3(self, s3_public_bucket, s3so): timeout -= 0.1 assert timeout > 0, "Timed out waiting for file to appear on moto" - def test_json_pandas_nulls(self, nulls_fixture, request): + def test_json_pandas_nulls(self, nulls_fixture): # GH 31615 - if isinstance(nulls_fixture, Decimal): - mark = pytest.mark.xfail(reason="not implemented") - request.applymarker(mark) - expected_warning = None msg = ( "The default 'epoch' date format is deprecated and will be removed " diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 62118f1c82ebb..c5ccc3b3f7184 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -57,56 +57,56 @@ def test_encode_decimal(self): sut = decimal.Decimal("1337.1337") encoded = ujson.ujson_dumps(sut, double_precision=15) decoded = ujson.ujson_loads(encoded) - assert decoded == 1337.1337 + assert decoded == "1337.1337" sut = decimal.Decimal("0.95") encoded = ujson.ujson_dumps(sut, double_precision=1) - assert encoded == "1.0" + assert encoded == '"0.95"' decoded = ujson.ujson_loads(encoded) - assert decoded == 1.0 + assert decoded == "0.95" sut = decimal.Decimal("0.94") encoded = ujson.ujson_dumps(sut, double_precision=1) - assert encoded == "0.9" + assert encoded == '"0.94"' decoded = ujson.ujson_loads(encoded) - assert decoded == 0.9 + assert decoded == "0.94" sut = decimal.Decimal("1.95") encoded = ujson.ujson_dumps(sut, double_precision=1) - assert encoded == "2.0" + assert encoded == '"1.95"' decoded = ujson.ujson_loads(encoded) - assert decoded == 2.0 + assert decoded == "1.95" sut = decimal.Decimal("-1.95") encoded = ujson.ujson_dumps(sut, double_precision=1) - assert encoded == "-2.0" + assert encoded == '"-1.95"' decoded = ujson.ujson_loads(encoded) - assert decoded == -2.0 + assert decoded == "-1.95" sut = decimal.Decimal("0.995") encoded = ujson.ujson_dumps(sut, double_precision=2) - assert encoded == "1.0" + assert encoded == '"0.995"' decoded = ujson.ujson_loads(encoded) - assert decoded == 1.0 + assert decoded == "0.995" sut = decimal.Decimal("0.9995") encoded = ujson.ujson_dumps(sut, double_precision=3) - assert encoded == "1.0" + assert encoded == '"0.9995"' decoded = ujson.ujson_loads(encoded) - assert decoded == 1.0 + assert decoded == "0.9995" sut = decimal.Decimal("0.99999999999999944") encoded = ujson.ujson_dumps(sut, double_precision=15) - assert encoded == "1.0" + assert encoded == '"0.99999999999999944"' decoded = ujson.ujson_loads(encoded) - assert decoded == 1.0 + assert decoded == "0.99999999999999944" @pytest.mark.parametrize("ensure_ascii", [True, False]) def test_encode_string_conversion(self, ensure_ascii):