Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

REF: remove Block access in the JSON writing code #41081

Merged
123 changes: 29 additions & 94 deletions pandas/_libs/src/ujson/python/objToJSON.c
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,6 @@ typedef struct __PdBlockContext {
int ncols;
int transpose;

int *cindices; // frame column -> block column map
NpyArrContext **npyCtxts; // NpyArrContext for each column
} PdBlockContext;

Expand Down Expand Up @@ -294,7 +293,12 @@ static int is_simple_frame(PyObject *obj) {
if (!mgr) {
return 0;
}
int ret = (get_attr_length(mgr, "blocks") <= 1);
int ret;
if (PyObject_HasAttrString(mgr, "blocks")) {
ret = (get_attr_length(mgr, "blocks") <= 1);
} else {
ret = 0;
}

Py_DECREF(mgr);
return ret;
Expand Down Expand Up @@ -656,16 +660,10 @@ void PdBlockPassThru_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
}

void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) {
PyObject *obj, *blocks, *block, *values, *tmp;
PyArrayObject *locs;
PyObject *obj, *values, *arrays, *array;
PdBlockContext *blkCtxt;
NpyArrContext *npyarr;
Py_ssize_t i;
NpyIter *iter;
NpyIter_IterNextFunc *iternext;
npy_int64 **dataptr;
npy_int64 colIdx;
npy_intp idx;

obj = (PyObject *)_obj;

Expand All @@ -687,7 +685,6 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) {

if (blkCtxt->ncols == 0) {
blkCtxt->npyCtxts = NULL;
blkCtxt->cindices = NULL;

GET_TC(tc)->iterNext = NpyArr_iterNextNone;
return;
Expand All @@ -701,104 +698,45 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) {
return;
}

blkCtxt->cindices = PyObject_Malloc(sizeof(int) * blkCtxt->ncols);
if (!blkCtxt->cindices) {
PyErr_NoMemory();
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
return;
}

blocks = get_sub_attr(obj, "_mgr", "blocks");
if (!blocks) {
arrays = get_sub_attr(obj, "_mgr", "column_arrays");
if (!arrays) {
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
return;
} else if (!PyTuple_Check(blocks)) {
PyErr_SetString(PyExc_TypeError, "blocks must be a tuple!");
goto BLKRET;
}

// force transpose so each NpyArrContext strides down its column
GET_TC(tc)->transpose = 1;

for (i = 0; i < PyObject_Length(blocks); i++) {
block = PyTuple_GET_ITEM(blocks, i);
if (!block) {
for (i = 0; i < PyObject_Length(arrays); i++) {
array = PyList_GET_ITEM(arrays, i);
if (!array) {
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
goto BLKRET;
goto ARR_RET;
}

tmp = PyObject_CallMethod(block, "get_block_values_for_json", NULL);
if (!tmp) {
// ensure we have a numpy array (i.e. np.asarray)
values = PyObject_CallMethod(array, "__array__", NULL);
if ((!values) || (!PyArray_CheckExact(values))) {
// Didn't get a numpy array
((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
goto BLKRET;
}

values = PyArray_Transpose((PyArrayObject *)tmp, NULL);
Py_DECREF(tmp);
if (!values) {
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
goto BLKRET;
}

locs = (PyArrayObject *)get_sub_attr(block, "mgr_locs", "as_array");
if (!locs) {
Py_DECREF(values);
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
goto BLKRET;
goto ARR_RET;
}

iter = NpyIter_New(locs, NPY_ITER_READONLY, NPY_KEEPORDER,
NPY_NO_CASTING, NULL);
if (!iter) {
Py_DECREF(values);
Py_DECREF(locs);
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
goto BLKRET;
}
iternext = NpyIter_GetIterNext(iter, NULL);
if (!iternext) {
NpyIter_Deallocate(iter);
Py_DECREF(values);
Py_DECREF(locs);
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
goto BLKRET;
}
dataptr = (npy_int64 **)NpyIter_GetDataPtrArray(iter);
do {
colIdx = **dataptr;
idx = NpyIter_GetIterIndex(iter);
GET_TC(tc)->newObj = values;

blkCtxt->cindices[colIdx] = idx;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think if you remove this assignment to cindices you can remove this struct member altogether - not aware of it being written to anywhere else. Might help reduce confusion over state management here

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, indeed, there are still some further things to clean up. And this cindices is no longer used.

// init a dedicated context for this column
NpyArr_iterBegin(obj, tc);
npyarr = GET_TC(tc)->npyarr;

// Reference freed in Pdblock_iterend
Py_INCREF(values);
GET_TC(tc)->newObj = values;

// init a dedicated context for this column
NpyArr_iterBegin(obj, tc);
npyarr = GET_TC(tc)->npyarr;

// set the dataptr to our desired column and initialise
if (npyarr != NULL) {
npyarr->dataptr += npyarr->stride * idx;
NpyArr_iterNext(obj, tc);
}
GET_TC(tc)->itemValue = NULL;
((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = NULL;

blkCtxt->npyCtxts[colIdx] = npyarr;
GET_TC(tc)->newObj = NULL;
} while (iternext(iter));
GET_TC(tc)->itemValue = NULL;
((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = NULL;

NpyIter_Deallocate(iter);
Py_DECREF(values);
Py_DECREF(locs);
blkCtxt->npyCtxts[i] = npyarr;
GET_TC(tc)->newObj = NULL;
}
GET_TC(tc)->npyarr = blkCtxt->npyCtxts[0];
goto ARR_RET;

BLKRET:
Py_DECREF(blocks);
ARR_RET:
Py_DECREF(arrays);
}

void PdBlock_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
Expand Down Expand Up @@ -830,9 +768,6 @@ void PdBlock_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
if (blkCtxt->npyCtxts) {
PyObject_Free(blkCtxt->npyCtxts);
}
if (blkCtxt->cindices) {
PyObject_Free(blkCtxt->cindices);
}
PyObject_Free(blkCtxt);
}
}
Expand Down
7 changes: 7 additions & 0 deletions pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -919,6 +919,13 @@ def iget_values(self, i: int) -> ArrayLike:
"""
return self.arrays[i]

@property
def column_arrays(self) -> list[ArrayLike]:
    """
    Used in the JSON C code to access column arrays.

    ArrayManager already stores exactly one 1D array per column, so the
    manager's own ``arrays`` list can be returned as-is — no per-block
    slicing or ndarray conversion is needed (contrast with the
    BlockManager implementation of the same property).
    """
    return self.arrays

def iset(self, loc: int | slice | np.ndarray, value: ArrayLike):
"""
Set new column(s).
Expand Down
8 changes: 0 additions & 8 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,14 +224,6 @@ def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray:
# expected "ndarray")
return self.values # type: ignore[return-value]

@final
def get_block_values_for_json(self) -> np.ndarray:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i think the "final" is out of date; same method on DatetimeLikeBlock

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is only a single get_block_values_for_json (no Block subclass overrides it)

Copy link
Member Author

@jorisvandenbossche jorisvandenbossche Apr 23, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or at least until a few hours ago, when your PR that changed this was merged ;) (You could have just as well said that you just changed this / pointed to #41082)

"""
This is used in the JSON C code.
"""
# TODO(EA2D): reshape will be unnecessary with 2D EAs
return np.asarray(self.values).reshape(self.shape)

@final
@cache_readonly
def fill_value(self):
Expand Down
18 changes: 18 additions & 0 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1137,6 +1137,24 @@ def iget_values(self, i: int) -> ArrayLike:
values = block.iget(self.blklocs[i])
return values

@property
def column_arrays(self) -> list[np.ndarray]:
"""
Used in the JSON C code to access column arrays.
This optimizes compared to using `iget_values` by converting each
block.values to a np.ndarray only once up front
"""
arrays = [np.asarray(blk.values) for blk in self.blocks]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i think this will change the behavior for dt64tz

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will not change behaviour, as it is the same as what get_block_values_for_json did until a few hours ago. The comment you added to the changed implementation in #41082 is "Not necessary to override, but helps perf", so I'm assuming that change didn't alter behaviour either.

I would also assume it is covered by the tests, but checking explicitly:

Master:

In [1]: pd.DataFrame({'a': [1, 2], 'b': pd.date_range("2012", periods=2, tz="Europe/Brussels")}).to_json()
Out[1]: '{"a":{"0":1,"1":2},"b":{"0":1325372400000,"1":1325458800000}}'

In [2]: pd.DataFrame({'b': pd.date_range("2012", periods=2, tz="Europe/Brussels")}).to_json()
Out[2]: '{"b":{"0":1325372400000,"1":1325458800000}}'

PR:

In [1]: pd.DataFrame({'a': [1, 2], 'b': pd.date_range("2012", periods=2, tz="Europe/Brussels")}).to_json()
Out[1]: '{"a":{"0":1,"1":2},"b":{"0":1325372400000,"1":1325458800000}}'

In [2]: pd.DataFrame({'b': pd.date_range("2012", periods=2, tz="Europe/Brussels")}).to_json()
Out[2]: '{"b":{"0":1325372400000,"1":1325458800000}}'

Now, to preserve the performance improvement of #41082, I can add a check for datetimetz.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hopefully more accurate statement: i think this will entail an object-dtype conversion for dt64tz. I'm not especially bothered by this bc i think this is a good cleanup

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

... which i now see you already addressed in a new commit. carry on

result = []
for i in range(len(self.items)):
arr = arrays[self.blknos[i]]
if arr.ndim == 2:
values = arr[self.blklocs[i]]
else:
values = arr
result.append(values)
return result

def iset(self, loc: int | slice | np.ndarray, value: ArrayLike):
"""
Set new item in-place. Does not consolidate. Adds new Block if not
Expand Down
4 changes: 0 additions & 4 deletions pandas/tests/io/json/test_json_table_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

from pandas.core.dtypes.dtypes import (
CategoricalDtype,
DatetimeTZDtype,
Expand All @@ -26,8 +24,6 @@
set_default_names,
)

pytestmark = td.skip_array_manager_not_yet_implemented


class TestBuildSchema:
def setup_method(self, method):
Expand Down
2 changes: 0 additions & 2 deletions pandas/tests/io/json/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -857,8 +857,6 @@ def test_convert_dates_infer(self, infer_word):
result = read_json(dumps(data))[["id", infer_word]]
tm.assert_frame_equal(result, expected)

# TODO(ArrayManager) JSON
@td.skip_array_manager_not_yet_implemented
@pytest.mark.parametrize(
"date,date_unit",
[
Expand Down