From 58c698fa255daffc4d9718eb1a5e51212e126aa0 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 2 Mar 2021 12:36:58 -0800 Subject: [PATCH 1/4] API: pseudo-public internals API for downstream libraries --- pandas/core/internals/__init__.py | 4 +- pandas/core/internals/api.py | 61 ++++++++++++++++++++++++ pandas/tests/internals/test_internals.py | 2 +- 3 files changed, 64 insertions(+), 3 deletions(-) create mode 100644 pandas/core/internals/api.py diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index 054ce8a40288b..672366ccdcc4d 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -1,6 +1,7 @@ +from pandas.core.internals.api import make_block # pseudo-public version from pandas.core.internals.array_manager import ArrayManager from pandas.core.internals.base import DataManager -from pandas.core.internals.blocks import ( # io.pytables, io.packers +from pandas.core.internals.blocks import ( # io.pytables Block, CategoricalBlock, DatetimeBlock, @@ -10,7 +11,6 @@ NumericBlock, ObjectBlock, TimeDeltaBlock, - make_block, ) from pandas.core.internals.concat import concatenate_managers from pandas.core.internals.managers import ( diff --git a/pandas/core/internals/api.py b/pandas/core/internals/api.py new file mode 100644 index 0000000000000..b4705dc84add4 --- /dev/null +++ b/pandas/core/internals/api.py @@ -0,0 +1,61 @@ +""" +This is a pseudo-public API for downstream libraries. We ask that downstream +authors + +1) Try to avoid using internals directly altogether, and failing that, +2) Use only functions exposed here (or in core.internals) + +""" +from typing import Optional + +import numpy as np + +from pandas._typing import Dtype + +from pandas.core.dtypes.common import is_datetime64tz_dtype +from pandas.core.dtypes.dtypes import PandasDtype +from pandas.core.dtypes.generic import ABCPandasArray + +from pandas.core.arrays import DatetimeArray +from pandas.core.internals.blocks import ( + Block, + DatetimeTZBlock, + get_block_type, +) + + +def make_block( + values, placement, klass=None, ndim=None, dtype: Optional[Dtype] = None +) -> Block: + """ + This is a pseudo-public analogue to blocks.make_block. + + We ask that downstream libraries use this rather than any fully-internal + APIs, including but not limited to: + + - core.internals.blocks.make_block + - Block.make_block + - Block.make_block_same_class + - Block.__init__ + """ + if isinstance(values, ABCPandasArray): + # Ensure that we don't allow PandasArray / PandasDtype in internals. + # For now, blocks should be backed by ndarrays when possible. + values = values.to_numpy() + if ndim and ndim > 1: + # TODO(EA2D): special case not needed with 2D EAs + values = np.atleast_2d(values) + + if isinstance(dtype, PandasDtype): + dtype = dtype.numpy_dtype + + if klass is None: + dtype = dtype or values.dtype + klass = get_block_type(values, dtype) + + elif klass is DatetimeTZBlock and not is_datetime64tz_dtype(values.dtype): + # TODO: This is no longer hit internally; does it need to be retained + # for e.g. pyarrow? + values = DatetimeArray._simple_new(values, dtype=dtype) + + return klass(values, ndim=ndim, placement=placement) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 54130bb075666..ff20d8a4e5b19 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -34,8 +34,8 @@ from pandas.core.internals import ( BlockManager, SingleBlockManager, - make_block, ) +from pandas.core.internals.blocks import make_block # private version @pytest.fixture From 723310152474ea96950920842ef3213fcb95f051 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 3 Mar 2021 08:09:40 -0800 Subject: [PATCH 2/4] make_block -> new_block --- pandas/core/internals/api.py | 2 +- pandas/core/internals/array_manager.py | 7 +++--- pandas/core/internals/blocks.py | 10 ++++---- pandas/core/internals/concat.py | 6 ++--- pandas/core/internals/construction.py | 4 ++-- pandas/core/internals/managers.py | 32 +++++++++++++------------- 6 files changed, 31 insertions(+), 30 deletions(-) diff --git a/pandas/core/internals/api.py b/pandas/core/internals/api.py index b4705dc84add4..3fbe324417c60 100644 --- a/pandas/core/internals/api.py +++ b/pandas/core/internals/api.py @@ -28,7 +28,7 @@ def make_block( values, placement, klass=None, ndim=None, dtype: Optional[Dtype] = None ) -> Block: """ - This is a pseudo-public analogue to blocks.make_block. + This is a pseudo-public analogue to blocks.new_block. We ask that downstream libraries use this rather than any fully-internal APIs, including but not limited to: diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 998f1ffcf02ee..e5a518d1d9875 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -72,7 +72,7 @@ ensure_index, ) from pandas.core.internals.base import DataManager -from pandas.core.internals.blocks import make_block +from pandas.core.internals.blocks import new_block if TYPE_CHECKING: from pandas.core.internals.managers import SingleBlockManager @@ -422,7 +422,8 @@ def apply_with_block(self: T, f, align_keys=None, **kwargs) -> T: arr = arr._data # type: ignore[union-attr] if isinstance(arr, np.ndarray): arr = np.atleast_2d(arr) - block = make_block(arr, placement=slice(0, 1, 1), ndim=2) + + block = new_block(arr, placement=slice(0, 1, 1), ndim=2) applied = getattr(block, f)(**kwargs) if isinstance(applied, list): applied = applied[0] @@ -741,7 +742,7 @@ def iget(self, i: int) -> SingleBlockManager: from pandas.core.internals.managers import SingleBlockManager values = self.arrays[i] - block = make_block(values, placement=slice(0, len(values)), ndim=1) + block = new_block(values, placement=slice(0, len(values)), ndim=1) return SingleBlockManager(block, self._axes[0]) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f2b8499a316b7..63bc203f552c7 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -314,7 +314,7 @@ def make_block(self, values, placement=None) -> Block: if self.is_extension: values = ensure_block_shape(values, ndim=self.ndim) - return make_block(values, placement=placement, ndim=self.ndim) + return new_block(values, placement=placement, ndim=self.ndim) @final def make_block_same_class(self, values, placement=None) -> Block: @@ -1434,7 +1434,7 @@ def _unstack(self, unstacker, fill_value, new_placement): new_values = new_values.T[mask] new_placement = new_placement[mask] - blocks = [make_block(new_values, placement=new_placement, ndim=2)] + blocks = [new_block(new_values, placement=new_placement, ndim=2)] return blocks, mask def quantile( @@ -1467,7 +1467,7 @@ def quantile( result = quantile_with_mask(values, mask, fill_value, qs, interpolation, axis) - return make_block(result, placement=self.mgr_locs, ndim=2) + return new_block(result, placement=self.mgr_locs, ndim=2) class ExtensionBlock(Block): @@ -1855,7 +1855,7 @@ def quantile(self, qs, interpolation="linear", axis: int = 0) -> Block: assert result.shape == (1, len(qs)), result.shape result = type(self.values)._from_factorized(result[0], self.values) - return make_block(result, placement=self.mgr_locs, ndim=2) + return new_block(result, placement=self.mgr_locs, ndim=2) class HybridMixin: @@ -2322,7 +2322,7 @@ def get_block_type(values, dtype: Optional[Dtype] = None): return cls -def make_block( +def new_block( values, placement, klass=None, ndim=None, dtype: Optional[Dtype] = None ) -> Block: # Ensure that we don't allow PandasArray / PandasDtype in internals. diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index a71fdff043212..0803e40a219be 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -42,7 +42,7 @@ ExtensionArray, ) from pandas.core.internals.array_manager import ArrayManager -from pandas.core.internals.blocks import make_block +from pandas.core.internals.blocks import new_block from pandas.core.internals.managers import BlockManager if TYPE_CHECKING: @@ -144,10 +144,10 @@ def concatenate_managers( # Fast-path b = blk.make_block_same_class(values, placement=placement) else: - b = make_block(values, placement=placement, ndim=blk.ndim) + b = new_block(values, placement=placement, ndim=blk.ndim) else: new_values = _concatenate_join_units(join_units, concat_axis, copy=copy) - b = make_block(new_values, placement=placement, ndim=len(axes)) + b = new_block(new_values, placement=placement, ndim=len(axes)) blocks.append(b) return BlockManager(blocks, axes) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index d49114c0da719..786f0fd62361d 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -292,11 +292,11 @@ def ndarray_to_mgr( if isinstance(dvals_list[n], np.ndarray): dvals_list[n] = dvals_list[n].reshape(1, -1) - from pandas.core.internals.blocks import make_block + from pandas.core.internals.blocks import new_block # TODO: What about re-joining object columns? block_values = [ - make_block(dvals_list[n], placement=[n], ndim=2) + new_block(dvals_list[n], placement=[n], ndim=2) for n in range(len(dvals_list)) ] diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 2ad7471d6f086..44d4636109208 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -71,7 +71,7 @@ ensure_block_shape, extend_blocks, get_block_type, - make_block, + new_block, ) from pandas.core.internals.ops import ( blockwise_all, @@ -319,7 +319,7 @@ def unpickle_block(values, mgr_locs, ndim: int): # TODO(EA2D): ndim would be unnecessary with 2D EAs # older pickles may store e.g. DatetimeIndex instead of DatetimeArray values = extract_array(values, extract_numpy=True) - return make_block(values, placement=mgr_locs, ndim=ndim) + return new_block(values, placement=mgr_locs, ndim=ndim) if isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]: state = state[3]["0.14.1"] @@ -1155,7 +1155,7 @@ def value_getitem(placement): # one item. # TODO(EA2D): special casing unnecessary with 2D EAs new_blocks.extend( - make_block( + new_block( values=value, ndim=self.ndim, placement=slice(mgr_loc, mgr_loc + 1), @@ -1171,7 +1171,7 @@ def value_getitem(placement): unfit_val_items = unfit_val_locs[0].append(unfit_val_locs[1:]) new_blocks.append( - make_block( + new_block( values=value_getitem(unfit_val_items), ndim=self.ndim, placement=unfit_mgr_locs, @@ -1216,7 +1216,7 @@ def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False value = ensure_block_shape(value, ndim=2) # TODO: type value as ArrayLike - block = make_block(values=value, ndim=self.ndim, placement=slice(loc, loc + 1)) + block = new_block(values=value, ndim=self.ndim, placement=slice(loc, loc + 1)) for blkno, count in _fast_count_smallints(self.blknos[loc:]): blk = self.blocks[blkno] @@ -1443,7 +1443,7 @@ def _make_na_block(self, placement, fill_value=None): dtype, fill_value = infer_dtype_from_scalar(fill_value) block_values = np.empty(block_shape, dtype=dtype) block_values.fill(fill_value) - return make_block(block_values, placement=placement, ndim=block_values.ndim) + return new_block(block_values, placement=placement, ndim=block_values.ndim) def take(self, indexer, axis: int = 1, verify: bool = True, convert: bool = True): """ @@ -1569,7 +1569,7 @@ def from_array(cls, array: ArrayLike, index: Index) -> SingleBlockManager: """ Constructor for if we have an array that is not yet a Block. """ - block = make_block(array, placement=slice(0, len(index)), ndim=1) + block = new_block(array, placement=slice(0, len(index)), ndim=1) return cls(block, index) def _post_setstate(self): @@ -1671,7 +1671,7 @@ def create_block_manager_from_blocks(blocks, axes: List[Index]) -> BlockManager: # is basically "all items", but if there're many, don't bother # converting, it's an error anyway. blocks = [ - make_block( + new_block( values=blocks[0], placement=slice(0, len(axes[0])), ndim=2 ) ] @@ -1782,7 +1782,7 @@ def _form_blocks( if len(items_dict["DatetimeTZBlock"]): dttz_blocks = [ - make_block(array, klass=DatetimeTZBlock, placement=i, ndim=2) + new_block(array, klass=DatetimeTZBlock, placement=i, ndim=2) for i, array in items_dict["DatetimeTZBlock"] ] blocks.extend(dttz_blocks) @@ -1793,14 +1793,14 @@ def _form_blocks( if len(items_dict["CategoricalBlock"]) > 0: cat_blocks = [ - make_block(array, klass=CategoricalBlock, placement=i, ndim=2) + new_block(array, klass=CategoricalBlock, placement=i, ndim=2) for i, array in items_dict["CategoricalBlock"] ] blocks.extend(cat_blocks) if len(items_dict["ExtensionBlock"]): external_blocks = [ - make_block(array, klass=ExtensionBlock, placement=i, ndim=2) + new_block(array, klass=ExtensionBlock, placement=i, ndim=2) for i, array in items_dict["ExtensionBlock"] ] @@ -1808,7 +1808,7 @@ def _form_blocks( if len(items_dict["ObjectValuesExtensionBlock"]): external_blocks = [ - make_block(array, klass=ObjectValuesExtensionBlock, placement=i, ndim=2) + new_block(array, klass=ObjectValuesExtensionBlock, placement=i, ndim=2) for i, array in items_dict["ObjectValuesExtensionBlock"] ] @@ -1821,7 +1821,7 @@ def _form_blocks( block_values = np.empty(shape, dtype=object) block_values.fill(np.nan) - na_block = make_block(block_values, placement=extra_locs, ndim=2) + na_block = new_block(block_values, placement=extra_locs, ndim=2) blocks.append(na_block) return blocks @@ -1838,7 +1838,7 @@ def _simple_blockify(tuples, dtype) -> List[Block]: if dtype is not None and values.dtype != dtype: # pragma: no cover values = values.astype(dtype) - block = make_block(values, placement=placement, ndim=2) + block = new_block(values, placement=placement, ndim=2) return [block] @@ -1852,7 +1852,7 @@ def _multi_blockify(tuples, dtype: Optional[Dtype] = None): values, placement = _stack_arrays(list(tup_block), dtype) - block = make_block(values, placement=placement, ndim=2) + block = new_block(values, placement=placement, ndim=2) new_blocks.append(block) return new_blocks @@ -1930,7 +1930,7 @@ def _merge_blocks( new_values = new_values[argsort] new_mgr_locs = new_mgr_locs[argsort] - return [make_block(new_values, placement=new_mgr_locs, ndim=2)] + return [new_block(new_values, placement=new_mgr_locs, ndim=2)] # can't consolidate --> no merge return blocks From 76425fc562008a615e7cfe22dff66dc698f0abba Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 3 Mar 2021 08:33:26 -0800 Subject: [PATCH 3/4] TST: test both new_block and make_block --- pandas/tests/internals/test_internals.py | 49 +++++++++++++++--------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index ff20d8a4e5b19..683006d9b3b9c 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -34,8 +34,17 @@ from pandas.core.internals import ( BlockManager, SingleBlockManager, + make_block, ) -from pandas.core.internals.blocks import make_block # private version +from pandas.core.internals.blocks import new_block + + +@pytest.fixture(params=[new_block, make_block]) +def block_maker(request): + """ + Fixture to test both the internal new_block and pseudo-public make_block. + """ + return request.param @pytest.fixture @@ -65,7 +74,7 @@ def get_numeric_mat(shape): N = 10 -def create_block(typestr, placement, item_shape=None, num_offset=0): +def create_block(typestr, placement, item_shape=None, num_offset=0, maker=new_block): """ Supported typestr: @@ -147,7 +156,7 @@ def create_block(typestr, placement, item_shape=None, num_offset=0): else: raise ValueError(f'Unsupported typestr: "{typestr}"') - return make_block(values, placement=placement, ndim=len(shape)) + return maker(values, placement=placement, ndim=len(shape)) def create_single_mgr(typestr, num_rows=None): @@ -290,7 +299,7 @@ def test_delete(self): def test_split(self): # GH#37799 values = np.random.randn(3, 4) - blk = make_block(values, placement=[3, 1, 6], ndim=2) + blk = new_block(values, placement=[3, 1, 6], ndim=2) result = blk._split() # check that we get views, not copies @@ -299,9 +308,9 @@ def test_split(self): assert len(result) == 3 expected = [ - make_block(values[[0]], placement=[3], ndim=2), - make_block(values[[1]], placement=[1], ndim=2), - make_block(values[[2]], placement=[6], ndim=2), + new_block(values[[0]], placement=[3], ndim=2), + new_block(values[[1]], placement=[1], ndim=2), + new_block(values[[2]], placement=[6], ndim=2), ] for res, exp in zip(result, expected): assert_block_equal(res, exp) @@ -365,7 +374,7 @@ def test_categorical_block_pickle(self): def test_iget(self): cols = Index(list("abc")) values = np.random.rand(3, 3) - block = make_block( + block = new_block( values=values.copy(), placement=np.arange(3), ndim=values.ndim ) mgr = BlockManager(blocks=[block], axes=[cols, np.arange(3)]) @@ -1149,7 +1158,7 @@ def test_datetime_block_can_hold_element(self): def test_interval_can_hold_element_emptylist(self, dtype, element): arr = np.array([1, 3, 4], dtype=dtype) ii = IntervalIndex.from_breaks(arr) - blk = make_block(ii._data, [1], ndim=2) + blk = new_block(ii._data, [1], ndim=2) assert blk._can_hold_element([]) # TODO: check this holds for all blocks @@ -1158,7 +1167,7 @@ def test_interval_can_hold_element_emptylist(self, dtype, element): def test_interval_can_hold_element(self, dtype, element): arr = np.array([1, 3, 4, 9], dtype=dtype) ii = IntervalIndex.from_breaks(arr) - blk = make_block(ii._data, [1], ndim=2) + blk = new_block(ii._data, [1], ndim=2) elem = element(ii) self.check_series_setitem(elem, ii, True) @@ -1183,7 +1192,7 @@ def test_interval_can_hold_element(self, dtype, element): def test_period_can_hold_element_emptylist(self): pi = period_range("2016", periods=3, freq="A") - blk = make_block(pi._data, [1], ndim=2) + blk = new_block(pi._data, [1], ndim=2) assert blk._can_hold_element([]) @@ -1278,18 +1287,18 @@ def test_should_store_categorical(self): ("sparse", SparseArray), ], ) -def test_holder(typestr, holder): - blk = create_block(typestr, [1]) +def test_holder(typestr, holder, block_maker): + blk = create_block(typestr, [1], maker=block_maker) assert blk._holder is holder -def test_validate_ndim(): +def test_validate_ndim(block_maker): values = np.array([1.0, 2.0]) placement = slice(2) msg = r"Wrong number of dimensions. values.ndim != ndim \[1 != 2\]" with pytest.raises(ValueError, match=msg): - make_block(values, placement, ndim=2) + block_maker(values, placement, ndim=2) def test_block_shape(): @@ -1300,22 +1309,24 @@ def test_block_shape(): assert a._mgr.blocks[0].mgr_locs.indexer == b._mgr.blocks[0].mgr_locs.indexer -def test_make_block_no_pandas_array(): +def test_make_block_no_pandas_array(block_maker): # https://github.com/pandas-dev/pandas/pull/24866 arr = pd.arrays.PandasArray(np.array([1, 2])) # PandasArray, no dtype - result = make_block(arr, slice(len(arr)), ndim=arr.ndim) + result = block_maker(arr, slice(len(arr)), ndim=arr.ndim) assert result.dtype.kind in ["i", "u"] assert result.is_extension is False # PandasArray, PandasDtype - result = make_block(arr, slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim) + result = block_maker(arr, slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim) assert result.dtype.kind in ["i", "u"] assert result.is_extension is False # ndarray, PandasDtype - result = make_block(arr.to_numpy(), slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim) + result = block_maker( + arr.to_numpy(), slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim + ) assert result.dtype.kind in ["i", "u"] assert result.is_extension is False From 9155137dc3bc2086963f8cc4781aed2dcb3c3e60 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 4 Mar 2021 20:55:15 -0800 Subject: [PATCH 4/4] test_namespace --- pandas/tests/internals/test_api.py | 39 ++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/pandas/tests/internals/test_api.py b/pandas/tests/internals/test_api.py index 035607611ae80..d4630b20db85f 100644 --- a/pandas/tests/internals/test_api.py +++ b/pandas/tests/internals/test_api.py @@ -9,3 +9,42 @@ def test_internals_api(): assert internals.make_block is api.make_block + + +def test_namespace(): + # SUBJECT TO CHANGE + + modules = [ + "blocks", + "concat", + "managers", + "construction", + "array_manager", + "base", + "api", + "ops", + ] + expected = [ + "Block", + "CategoricalBlock", + "NumericBlock", + "DatetimeBlock", + "DatetimeTZBlock", + "ExtensionBlock", + "FloatBlock", + "ObjectBlock", + "TimeDeltaBlock", + "make_block", + "DataManager", + "ArrayManager", + "BlockManager", + "SingleDataManager", + "SingleBlockManager", + "SingleArrayManager", + "concatenate_managers", + "create_block_manager_from_arrays", + "create_block_manager_from_blocks", + ] + + result = [x for x in dir(internals) if not x.startswith("__")] + assert set(result) == set(expected + modules)