Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ArrayManager] DataFrame constructors #39991

Merged
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
61983d8
[ArrayManager] DataFrame constructors
jorisvandenbossche Feb 23, 2021
1d0315f
clean-up signatures
jorisvandenbossche Feb 23, 2021
ffc8314
'fix' for PandasArrays
jorisvandenbossche Feb 23, 2021
46e73c8
tests
jorisvandenbossche Feb 23, 2021
3e108df
ensure datetime-like array
jorisvandenbossche Feb 23, 2021
854bb17
Merge remote-tracking branch 'upstream/master' into am-constructors
jorisvandenbossche Feb 24, 2021
8726d42
small clean-up - additional comments
jorisvandenbossche Feb 24, 2021
6e17183
use string join for msg
jorisvandenbossche Feb 24, 2021
8096665
add github issue link to comment
jorisvandenbossche Feb 24, 2021
aef4cc8
Merge remote-tracking branch 'upstream/master' into am-constructors
jorisvandenbossche Feb 25, 2021
9c0a3d6
Merge remote-tracking branch 'upstream/master' into am-constructors
jorisvandenbossche Feb 25, 2021
1eb5cb7
Merge remote-tracking branch 'upstream/master' into am-constructors
jorisvandenbossche Feb 26, 2021
0992e67
Merge remote-tracking branch 'upstream/master' into am-constructors
jorisvandenbossche Feb 26, 2021
936b290
Merge remote-tracking branch 'upstream/master' into am-constructors
jorisvandenbossche Mar 1, 2021
54d36ab
move wrapping inside ArrayManager constructor
jorisvandenbossche Mar 1, 2021
c56ffa8
remove skip
jorisvandenbossche Mar 1, 2021
164387c
trigger ci
jorisvandenbossche Mar 1, 2021
143b572
add skip for rename copy
jorisvandenbossche Mar 1, 2021
6166927
Merge remote-tracking branch 'upstream/master' into am-constructors
jorisvandenbossche Mar 2, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,8 @@ jobs:
run: |
source activate pandas-dev
pytest pandas/tests/frame/methods --array-manager
pytest pandas/tests/frame/test_constructors.py --array-manager
pytest pandas/tests/frame/constructors/ --array-manager
pytest pandas/tests/frame/test_reductions.py --array-manager
pytest pandas/tests/reductions/ --array-manager
pytest pandas/tests/generic/test_generic.py --array-manager
Expand Down
62 changes: 44 additions & 18 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -564,41 +564,59 @@ def __init__(
if isinstance(data, DataFrame):
data = data._mgr

if isinstance(data, (BlockManager, ArrayManager)):
if index is None and columns is None and dtype is None and copy is False:
# GH#33357 fastpath
NDFrame.__init__(self, data)
return
# first check if a Manager is passed without any other arguments
# -> use fastpath (without checking Manager type)
if (
index is None
and columns is None
and dtype is None
and copy is False
and isinstance(data, (BlockManager, ArrayManager))
jreback marked this conversation as resolved.
Show resolved Hide resolved
):
# GH#33357 fastpath
NDFrame.__init__(self, data)
return

manager = get_option("mode.data_manager")

if isinstance(data, (BlockManager, ArrayManager)):
jreback marked this conversation as resolved.
Show resolved Hide resolved
mgr = self._init_mgr(
data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy
)

elif isinstance(data, dict):
mgr = init_dict(data, index, columns, dtype=dtype)
mgr = init_dict(data, index, columns, dtype=dtype, typ=manager)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you rename typ -> manager?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note this is the same keyword as I use for arrays_to_mgr, mgr_to_mgr, etc, where I think the typ keyword makes sense (it's already clear from the name of the function that it is creating a manager).

Alternatively, we could make the init_dict function name more explicit (and consistent with the others), e.g. by renaming it to dict_to_mgr. I am happy to do this in a separate PR.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, yeah, +1 on the rename (a follow-on PR is fine) and on anything that makes the keyword more obvious.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

-> #40074

elif isinstance(data, ma.MaskedArray):
import numpy.ma.mrecords as mrecords

# masked recarray
if isinstance(data, mrecords.MaskedRecords):
mgr = masked_rec_array_to_mgr(data, index, columns, dtype, copy)
mgr = masked_rec_array_to_mgr(
data, index, columns, dtype, copy, typ=manager
)

# a masked array
else:
data = sanitize_masked_array(data)
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
mgr = init_ndarray(
data, index, columns, dtype=dtype, copy=copy, typ=manager
)

elif isinstance(data, (np.ndarray, Series, Index)):
if data.dtype.names:
data_columns = list(data.dtype.names)
data = {k: data[k] for k in data_columns}
if columns is None:
columns = data_columns
mgr = init_dict(data, index, columns, dtype=dtype)
mgr = init_dict(data, index, columns, dtype=dtype, typ=manager)
elif getattr(data, "name", None) is not None:
mgr = init_dict({data.name: data}, index, columns, dtype=dtype)
mgr = init_dict(
{data.name: data}, index, columns, dtype=dtype, typ=manager
)
else:
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
mgr = init_ndarray(
data, index, columns, dtype=dtype, copy=copy, typ=manager
)

# For data is list-like, or Iterable (will consume into list)
elif is_list_like(data):
Expand All @@ -611,11 +629,15 @@ def __init__(
arrays, columns, index = nested_data_to_arrays(
data, columns, index, dtype
)
mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
mgr = arrays_to_mgr(
arrays, columns, index, columns, dtype=dtype, typ=manager
)
else:
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
mgr = init_ndarray(
data, index, columns, dtype=dtype, copy=copy, typ=manager
)
else:
mgr = init_dict({}, index, columns, dtype=dtype)
mgr = init_dict({}, index, columns, dtype=dtype, typ=manager)
# For data is scalar
else:
if index is None or columns is None:
Expand All @@ -632,18 +654,19 @@ def __init__(
construct_1d_arraylike_from_scalar(data, len(index), dtype)
for _ in range(len(columns))
]
mgr = arrays_to_mgr(values, columns, index, columns, dtype=None)
mgr = arrays_to_mgr(
values, columns, index, columns, dtype=None, typ=manager
)
else:
values = construct_2d_arraylike_from_scalar(
data, len(index), len(columns), dtype, copy
)

mgr = init_ndarray(
values, index, columns, dtype=values.dtype, copy=False
values, index, columns, dtype=values.dtype, copy=False, typ=manager
)

# ensure correct Manager type according to settings
manager = get_option("mode.data_manager")
mgr = mgr_to_mgr(mgr, typ=manager)

NDFrame.__init__(self, mgr)
Expand Down Expand Up @@ -1971,7 +1994,8 @@ def from_records(
arr_columns = arr_columns.drop(arr_exclude)
columns = columns.drop(exclude)

mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns)
manager = get_option("mode.data_manager")
mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns, typ=manager)

return cls(mgr)

Expand Down Expand Up @@ -2178,13 +2202,15 @@ def _from_arrays(
if dtype is not None:
dtype = pandas_dtype(dtype)

manager = get_option("mode.data_manager")
mgr = arrays_to_mgr(
arrays,
columns,
index,
columns,
dtype=dtype,
verify_integrity=verify_integrity,
typ=manager,
)
return cls(mgr)

Expand Down
3 changes: 3 additions & 0 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@
ArrayManager,
BlockManager,
)
from pandas.core.internals.construction import mgr_to_mgr
from pandas.core.missing import find_valid_index
from pandas.core.ops import align_method_FRAME
from pandas.core.reshape.concat import concat
Expand Down Expand Up @@ -5752,6 +5753,8 @@ def _to_dict_of_blocks(self, copy: bool_t = True):
Internal ONLY - only works for BlockManager
"""
mgr = self._mgr
# convert to BlockManager if needed -> this way support ArrayManager as well
mgr = mgr_to_mgr(mgr, "block")
mgr = cast(BlockManager, mgr)
return {
k: self._constructor(v).__finalize__(self)
Expand Down
8 changes: 7 additions & 1 deletion pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -816,7 +816,13 @@ def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False

value = extract_array(value, extract_numpy=True)
if value.ndim == 2:
value = value[0, :]
if value.shape[0] == 1:
jreback marked this conversation as resolved.
Show resolved Hide resolved
value = value[0, :]
else:
raise ValueError(
f"Expected a 1D array, got an array with shape {value.shape}"
)

# TODO self.arrays can be empty
# assert len(value) == len(self.arrays[0])

Expand Down
42 changes: 27 additions & 15 deletions pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
)
from pandas.core.arrays import Categorical
from pandas.core.construction import (
ensure_wrapped_if_datetimelike,
extract_array,
sanitize_array,
)
Expand All @@ -71,7 +72,9 @@
get_objs_combined_axis,
union_indexes,
)
from pandas.core.internals.array_manager import ArrayManager
from pandas.core.internals.managers import (
BlockManager,
create_block_manager_from_arrays,
create_block_manager_from_blocks,
)
Expand All @@ -90,6 +93,7 @@ def arrays_to_mgr(
columns,
dtype: Optional[DtypeObj] = None,
verify_integrity: bool = True,
typ: Optional[str] = None,
):
"""
Segregate Series based on type and coerce into matrices.
Expand All @@ -107,7 +111,8 @@ def arrays_to_mgr(

# don't force copy because getting jammed in an ndarray anyway
arrays = _homogenize(arrays, index, dtype)

if typ == "array":
arrays = [ensure_wrapped_if_datetimelike(arr) for arr in arrays]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could do this unconditionally. Tradeoff: slightly less complex code here, at the perf cost of unnecessary wrapping/unwrapping in DatetimeBlock/TimedeltaBlock._maybe_coerce_values (for now, until an upcoming PR).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would then leave that until it is useful

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you do this inside, e.g., the ArrayManager constructor (or add a constructor on L127)? We probably should do that anyhow.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add a constructor on L127

What do you mean exactly with that?
We already use the main constructor ArrayManager(arrays, [index, columns]) on L127 because that constructor already works from "arrays" (in contrast to the BlockManager, which needs to consolidate arrays etc., so there is a helper create_block_manager_from_arrays; for ArrayManager this is exactly what __init__ already does)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the suggestion is to move the call on L115 to just before the return ArrayManager on L127. I think that'd be a small improvement, not a big deal.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I intentionally put it here because, in theory, I think this wrapping only needs to be done with the default of verify_integrity=True (so you have a way to skip it)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean that on L127 this should call a method inside internal rather than calling the manager directly. Then you can handle the special cases (on L110) inside that function.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So you want me to write this function?

def create_array_manager_from_arrays(arrays, axes, verify_integrity=True):
    if verify_integrity:
        arrays = [ensure_wrapped_if_datetimelike(arr) for arr in arrays]
    return ArrayManager(arrays, axes)

(IMO this just splits the handling of verify_integrity to multiple places (as mentioned above, that's the reason that I put this wrapping where it is located now), but I don't care if this makes you happy)

this should call a method inside internal

Note you are commenting on a file that already is inside internals

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's L110 that needs to be handled in array_manager.py, not here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, moved inside array_manager.py

columns = ensure_index(columns)
else:
columns = ensure_index(columns)
Expand All @@ -116,11 +121,16 @@ def arrays_to_mgr(
# from BlockManager perspective
axes = [columns, index]

return create_block_manager_from_arrays(arrays, arr_names, axes)
if typ == "block":
return create_block_manager_from_arrays(arrays, arr_names, axes)
elif typ == "array":
return ArrayManager(arrays, [index, columns])
else:
raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")


def masked_rec_array_to_mgr(
data: MaskedRecords, index, columns, dtype: Optional[DtypeObj], copy: bool
data: MaskedRecords, index, columns, dtype: Optional[DtypeObj], copy: bool, typ: str
):
"""
Extract from a masked rec array and create the manager.
Expand Down Expand Up @@ -154,7 +164,7 @@ def masked_rec_array_to_mgr(
if columns is None:
columns = arr_columns

mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype)
mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype, typ=typ)

if copy:
mgr = mgr.copy()
Expand All @@ -166,19 +176,14 @@ def mgr_to_mgr(mgr, typ: str):
Convert to specific type of Manager. Does not copy if the type is already
correct. Does not guarantee a copy otherwise.
"""
from pandas.core.internals import (
ArrayManager,
BlockManager,
)

new_mgr: Manager

if typ == "block":
if isinstance(mgr, BlockManager):
new_mgr = mgr
else:
new_mgr = arrays_to_mgr(
mgr.arrays, mgr.axes[0], mgr.axes[1], mgr.axes[0], dtype=None
mgr.arrays, mgr.axes[0], mgr.axes[1], mgr.axes[0], typ="block"
)
elif typ == "array":
if isinstance(mgr, ArrayManager):
Expand All @@ -187,15 +192,17 @@ def mgr_to_mgr(mgr, typ: str):
arrays = [mgr.iget_values(i).copy() for i in range(len(mgr.axes[0]))]
new_mgr = ArrayManager(arrays, [mgr.axes[1], mgr.axes[0]])
else:
raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{type}'")
raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
return new_mgr


# ---------------------------------------------------------------------
# DataFrame Constructor Interface


def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool):
def init_ndarray(
values, index, columns, dtype: Optional[DtypeObj], copy: bool, typ: str
):
# input must be a ndarray, list, Series, index

if isinstance(values, ABCSeries):
Expand Down Expand Up @@ -224,7 +231,7 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool):
if columns is None:
columns = Index(range(len(values)))

return arrays_to_mgr(values, columns, index, columns, dtype=dtype)
return arrays_to_mgr(values, columns, index, columns, dtype=dtype, typ=typ)

# by definition an array here
# the dtypes will be coerced to a single dtype
Expand Down Expand Up @@ -277,7 +284,7 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool):
return create_block_manager_from_blocks(block_values, [columns, index])


def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None):
def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj], typ: str):
"""
Segregate Series based on type and coerce into matrices.
Needs to handle a lot of exceptional cases.
Expand Down Expand Up @@ -321,7 +328,7 @@ def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None):
arrays = [
arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
]
return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype, typ=typ)


def nested_data_to_arrays(
Expand Down Expand Up @@ -415,6 +422,11 @@ def _homogenize(data, index: Index, dtype: Optional[DtypeObj]):
# Forces alignment. No need to copy data since we
# are putting it into an ndarray later
val = val.reindex(index, copy=False)
# TODO extract_array should be preferred, but that gives failures for
# `extension/test_numpy.py` (extract_array will convert numpy arrays
# to PandasArray), see https://github.com/pandas-dev/pandas/issues/40021
# val = extract_array(val, extract_numpy=True)
val = val._values
jorisvandenbossche marked this conversation as resolved.
Show resolved Hide resolved
else:
if isinstance(val, dict):
if oindex is None:
Expand Down
20 changes: 18 additions & 2 deletions pandas/tests/frame/constructors/test_from_records.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pytz

from pandas.compat import is_platform_little_endian
import pandas.util._test_decorators as td

from pandas import (
CategoricalIndex,
Expand Down Expand Up @@ -118,6 +119,8 @@ def test_from_records_sequencelike(self):
tm.assert_series_equal(result["C"], df["C"])
tm.assert_series_equal(result["E1"], df["E1"].astype("float64"))

@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) empty from_records
def test_from_records_sequencelike_empty(self):
# empty case
result = DataFrame.from_records([], columns=["foo", "bar", "baz"])
assert len(result) == 0
Expand Down Expand Up @@ -184,7 +187,12 @@ def test_from_records_bad_index_column(self):
tm.assert_index_equal(df1.index, Index(df.C))

# should fail
msg = r"Shape of passed values is \(10, 3\), indices imply \(1, 3\)"
msg = "|".join(
[
r"Shape of passed values is \(10, 3\), indices imply \(1, 3\)",
"Passed arrays should have the same length as the rows Index: 10 vs 1",
]
)
with pytest.raises(ValueError, match=msg):
DataFrame.from_records(df, index=[2])
with pytest.raises(KeyError, match=r"^2$"):
Expand All @@ -208,6 +216,7 @@ def __iter__(self):
expected = DataFrame.from_records(tups)
tm.assert_frame_equal(result, expected)

@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) empty from_records
def test_from_records_len0_with_columns(self):
# GH#2633
result = DataFrame.from_records([], index="foo", columns=["foo", "bar"])
Expand Down Expand Up @@ -259,7 +268,12 @@ def test_from_records_to_records(self):
tm.assert_frame_equal(DataFrame.from_records(arr2), DataFrame(arr2))

# wrong length
msg = r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)"
msg = "|".join(
[
r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)",
"Passed arrays should have the same length as the rows Index: 2 vs 1",
]
)
with pytest.raises(ValueError, match=msg):
DataFrame.from_records(arr, index=index[:-1])

Expand Down Expand Up @@ -386,6 +400,7 @@ def create_dict(order_id):
result = DataFrame.from_records(documents, index=["order_id", "quantity"])
assert result.index.names == ("order_id", "quantity")

@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) empty from_records
def test_from_records_misc_brokenness(self):
# GH#2179

Expand Down Expand Up @@ -424,6 +439,7 @@ def test_from_records_misc_brokenness(self):
)
tm.assert_series_equal(result, expected)

@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) empty from_records
def test_from_records_empty(self):
# GH#3562
result = DataFrame.from_records([], columns=["a", "b", "c"])
Expand Down
Loading