Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ArrayManager] DataFrame constructors #39991

Merged
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
61983d8
[ArrayManager] DataFrame constructors
jorisvandenbossche Feb 23, 2021
1d0315f
clean-up signatures
jorisvandenbossche Feb 23, 2021
ffc8314
'fix' for PandasArrays
jorisvandenbossche Feb 23, 2021
46e73c8
tests
jorisvandenbossche Feb 23, 2021
3e108df
ensure datetime-like array
jorisvandenbossche Feb 23, 2021
854bb17
Merge remote-tracking branch 'upstream/master' into am-constructors
jorisvandenbossche Feb 24, 2021
8726d42
small clean-up - additional comments
jorisvandenbossche Feb 24, 2021
6e17183
use string join for msg
jorisvandenbossche Feb 24, 2021
8096665
add github issue link to comment
jorisvandenbossche Feb 24, 2021
aef4cc8
Merge remote-tracking branch 'upstream/master' into am-constructors
jorisvandenbossche Feb 25, 2021
9c0a3d6
Merge remote-tracking branch 'upstream/master' into am-constructors
jorisvandenbossche Feb 25, 2021
1eb5cb7
Merge remote-tracking branch 'upstream/master' into am-constructors
jorisvandenbossche Feb 26, 2021
0992e67
Merge remote-tracking branch 'upstream/master' into am-constructors
jorisvandenbossche Feb 26, 2021
936b290
Merge remote-tracking branch 'upstream/master' into am-constructors
jorisvandenbossche Mar 1, 2021
54d36ab
move wrapping inside ArrayManager constructor
jorisvandenbossche Mar 1, 2021
c56ffa8
remove skip
jorisvandenbossche Mar 1, 2021
164387c
trigger ci
jorisvandenbossche Mar 1, 2021
143b572
add skip for rename copy
jorisvandenbossche Mar 1, 2021
6166927
Merge remote-tracking branch 'upstream/master' into am-constructors
jorisvandenbossche Mar 2, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,9 @@ jobs:
run: |
source activate pandas-dev
pytest pandas/tests/frame/methods --array-manager
pytest pandas/tests/frame/test_constructors.py --array-manager
pytest pandas/tests/frame/constructors/ --array-manager

pytest pandas/tests/arithmetic/ --array-manager
pytest pandas/tests/reshape/merge --array-manager

Expand Down
60 changes: 42 additions & 18 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -563,41 +563,57 @@ def __init__(
if isinstance(data, DataFrame):
data = data._mgr

if isinstance(data, (BlockManager, ArrayManager)):
if index is None and columns is None and dtype is None and copy is False:
# GH#33357 fastpath
NDFrame.__init__(self, data)
return
if (
index is None
and columns is None
and dtype is None
and copy is False
and isinstance(data, (BlockManager, ArrayManager))
jreback marked this conversation as resolved.
Show resolved Hide resolved
):
# GH#33357 fastpath
NDFrame.__init__(self, data)
return

manager = get_option("mode.data_manager")

if isinstance(data, (BlockManager, ArrayManager)):
jreback marked this conversation as resolved.
Show resolved Hide resolved
mgr = self._init_mgr(
data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy
)

elif isinstance(data, dict):
mgr = init_dict(data, index, columns, dtype=dtype)
mgr = init_dict(data, index, columns, dtype=dtype, typ=manager)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you rename typ -> manager?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that this is the same keyword I use for arrays_to_mgr, mgr_to_mgr, etc., where I think the typ keyword makes sense (it's already clear from the name of the function that it is creating a manager).

Alternatively, we could make the init_dict function name more explicit (and consistent with the others), e.g. by renaming it to dict_to_mgr. I am happy to do this in a separate PR.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, yeah, +1 on the rename (doing it in a follow-on PR is fine), and on anything that makes the keyword more obvious.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

-> #40074

elif isinstance(data, ma.MaskedArray):
import numpy.ma.mrecords as mrecords

# masked recarray
if isinstance(data, mrecords.MaskedRecords):
mgr = masked_rec_array_to_mgr(data, index, columns, dtype, copy)
mgr = masked_rec_array_to_mgr(
data, index, columns, dtype, copy, typ=manager
)

# a masked array
else:
data = sanitize_masked_array(data)
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
mgr = init_ndarray(
data, index, columns, dtype=dtype, copy=copy, typ=manager
)

elif isinstance(data, (np.ndarray, Series, Index)):
if data.dtype.names:
data_columns = list(data.dtype.names)
data = {k: data[k] for k in data_columns}
if columns is None:
columns = data_columns
mgr = init_dict(data, index, columns, dtype=dtype)
mgr = init_dict(data, index, columns, dtype=dtype, typ=manager)
elif getattr(data, "name", None) is not None:
mgr = init_dict({data.name: data}, index, columns, dtype=dtype)
mgr = init_dict(
{data.name: data}, index, columns, dtype=dtype, typ=manager
)
else:
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
mgr = init_ndarray(
data, index, columns, dtype=dtype, copy=copy, typ=manager
)

# For data is list-like, or Iterable (will consume into list)
elif is_list_like(data):
Expand All @@ -610,11 +626,15 @@ def __init__(
arrays, columns, index = nested_data_to_arrays(
data, columns, index, dtype
)
mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
mgr = arrays_to_mgr(
arrays, columns, index, columns, dtype=dtype, typ=manager
)
else:
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
mgr = init_ndarray(
data, index, columns, dtype=dtype, copy=copy, typ=manager
)
else:
mgr = init_dict({}, index, columns, dtype=dtype)
mgr = init_dict({}, index, columns, dtype=dtype, typ=manager)
# For data is scalar
else:
if index is None or columns is None:
Expand All @@ -631,18 +651,19 @@ def __init__(
construct_1d_arraylike_from_scalar(data, len(index), dtype)
for _ in range(len(columns))
]
mgr = arrays_to_mgr(values, columns, index, columns, dtype=None)
mgr = arrays_to_mgr(
values, columns, index, columns, dtype=None, typ=manager
)
else:
values = construct_2d_arraylike_from_scalar(
data, len(index), len(columns), dtype, copy
)

mgr = init_ndarray(
values, index, columns, dtype=values.dtype, copy=False
values, index, columns, dtype=values.dtype, copy=False, typ=manager
)

# ensure correct Manager type according to settings
manager = get_option("mode.data_manager")
mgr = mgr_to_mgr(mgr, typ=manager)

NDFrame.__init__(self, mgr)
Expand Down Expand Up @@ -1970,7 +1991,8 @@ def from_records(
arr_columns = arr_columns.drop(arr_exclude)
columns = columns.drop(exclude)

mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns)
manager = get_option("mode.data_manager")
mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns, typ=manager)

return cls(mgr)

Expand Down Expand Up @@ -2177,13 +2199,15 @@ def _from_arrays(
if dtype is not None:
dtype = pandas_dtype(dtype)

manager = get_option("mode.data_manager")
mgr = arrays_to_mgr(
arrays,
columns,
index,
columns,
dtype=dtype,
verify_integrity=verify_integrity,
typ=manager,
)
return cls(mgr)

Expand Down
2 changes: 2 additions & 0 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@
ArrayManager,
BlockManager,
)
from pandas.core.internals.construction import mgr_to_mgr
from pandas.core.missing import find_valid_index
from pandas.core.ops import align_method_FRAME
from pandas.core.reshape.concat import concat
Expand Down Expand Up @@ -5752,6 +5753,7 @@ def _to_dict_of_blocks(self, copy: bool_t = True):
Internal ONLY - only works for BlockManager
"""
mgr = self._mgr
mgr = mgr_to_mgr(mgr, "block")
mgr = cast(BlockManager, mgr)
return {
k: self._constructor(v).__finalize__(self)
Expand Down
8 changes: 7 additions & 1 deletion pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -741,7 +741,13 @@ def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False

value = extract_array(value, extract_numpy=True)
if value.ndim == 2:
value = value[0, :]
if value.shape[0] == 1:
jreback marked this conversation as resolved.
Show resolved Hide resolved
value = value[0, :]
else:
raise ValueError(
f"expected 1D array, got array with shape {value.shape}"
)

# TODO self.arrays can be empty
# assert len(value) == len(self.arrays[0])

Expand Down
37 changes: 23 additions & 14 deletions pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,9 @@
get_objs_combined_axis,
union_indexes,
)
from pandas.core.internals.array_manager import ArrayManager
from pandas.core.internals.managers import (
BlockManager,
create_block_manager_from_arrays,
create_block_manager_from_blocks,
)
Expand All @@ -90,6 +92,7 @@ def arrays_to_mgr(
columns,
dtype: Optional[DtypeObj] = None,
verify_integrity: bool = True,
typ: Optional[str] = None,
):
"""
Segregate Series based on type and coerce into matrices.
Expand All @@ -116,11 +119,16 @@ def arrays_to_mgr(
# from BlockManager perspective
axes = [columns, index]

return create_block_manager_from_arrays(arrays, arr_names, axes)
if typ == "block":
return create_block_manager_from_arrays(arrays, arr_names, axes)
elif typ == "array":
return ArrayManager(arrays, [index, columns])
else:
raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")


def masked_rec_array_to_mgr(
data: MaskedRecords, index, columns, dtype: Optional[DtypeObj], copy: bool
data: MaskedRecords, index, columns, dtype: Optional[DtypeObj], copy: bool, typ: str
):
"""
Extract from a masked rec array and create the manager.
Expand Down Expand Up @@ -154,7 +162,7 @@ def masked_rec_array_to_mgr(
if columns is None:
columns = arr_columns

mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype)
mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype, typ=typ)

if copy:
mgr = mgr.copy()
Expand All @@ -166,19 +174,14 @@ def mgr_to_mgr(mgr, typ: str):
Convert to specific type of Manager. Does not copy if the type is already
correct. Does not guarantee a copy otherwise.
"""
from pandas.core.internals import (
ArrayManager,
BlockManager,
)

new_mgr: Manager

if typ == "block":
if isinstance(mgr, BlockManager):
new_mgr = mgr
else:
new_mgr = arrays_to_mgr(
mgr.arrays, mgr.axes[0], mgr.axes[1], mgr.axes[0], dtype=None
mgr.arrays, mgr.axes[0], mgr.axes[1], mgr.axes[0], typ="block"
)
elif typ == "array":
if isinstance(mgr, ArrayManager):
Expand All @@ -187,15 +190,17 @@ def mgr_to_mgr(mgr, typ: str):
arrays = [mgr.iget_values(i).copy() for i in range(len(mgr.axes[0]))]
new_mgr = ArrayManager(arrays, [mgr.axes[1], mgr.axes[0]])
else:
raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{type}'")
raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
return new_mgr


# ---------------------------------------------------------------------
# DataFrame Constructor Interface


def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool):
def init_ndarray(
values, index, columns, dtype: Optional[DtypeObj], copy: bool, typ: str
):
# input must be a ndarray, list, Series, index

if isinstance(values, ABCSeries):
Expand Down Expand Up @@ -224,7 +229,7 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool):
if columns is None:
columns = Index(range(len(values)))

return arrays_to_mgr(values, columns, index, columns, dtype=dtype)
return arrays_to_mgr(values, columns, index, columns, dtype=dtype, typ=typ)

# by definition an array here
# the dtypes will be coerced to a single dtype
Expand Down Expand Up @@ -277,7 +282,7 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool):
return create_block_manager_from_blocks(block_values, [columns, index])


def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None):
def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj], typ: str):
"""
Segregate Series based on type and coerce into matrices.
Needs to handle a lot of exceptional cases.
Expand Down Expand Up @@ -321,7 +326,7 @@ def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None):
arrays = [
arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
]
return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype, typ=typ)


def nested_data_to_arrays(
Expand Down Expand Up @@ -415,6 +420,10 @@ def _homogenize(data, index, dtype: Optional[DtypeObj]):
# Forces alignment. No need to copy data since we
# are putting it into an ndarray later
val = val.reindex(index, copy=False)
# extract_array should be preferred? But that gives failures for
# `extension/test_numpy.py`
# val = extract_array(val, extract_numpy=True)
val = val._values
jorisvandenbossche marked this conversation as resolved.
Show resolved Hide resolved
else:
if isinstance(val, dict):
if oindex is None:
Expand Down
16 changes: 14 additions & 2 deletions pandas/tests/frame/constructors/test_from_records.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pytz

from pandas.compat import is_platform_little_endian
import pandas.util._test_decorators as td

from pandas import (
CategoricalIndex,
Expand Down Expand Up @@ -118,6 +119,8 @@ def test_from_records_sequencelike(self):
tm.assert_series_equal(result["C"], df["C"])
tm.assert_series_equal(result["E1"], df["E1"].astype("float64"))

@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) empty from_records
def test_from_records_sequencelike_empty(self):
# empty case
result = DataFrame.from_records([], columns=["foo", "bar", "baz"])
assert len(result) == 0
Expand Down Expand Up @@ -184,7 +187,10 @@ def test_from_records_bad_index_column(self):
tm.assert_index_equal(df1.index, Index(df.C))

# should fail
msg = r"Shape of passed values is \(10, 3\), indices imply \(1, 3\)"
msg = (
jorisvandenbossche marked this conversation as resolved.
Show resolved Hide resolved
r"Shape of passed values is \(10, 3\), indices imply \(1, 3\)|"
"Passed arrays should have the same length as the rows Index: 10 vs 1 rows"
)
with pytest.raises(ValueError, match=msg):
DataFrame.from_records(df, index=[2])
with pytest.raises(KeyError, match=r"^2$"):
Expand All @@ -208,6 +214,7 @@ def __iter__(self):
expected = DataFrame.from_records(tups)
tm.assert_frame_equal(result, expected)

@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) empty from_records
def test_from_records_len0_with_columns(self):
# GH#2633
result = DataFrame.from_records([], index="foo", columns=["foo", "bar"])
Expand Down Expand Up @@ -259,7 +266,10 @@ def test_from_records_to_records(self):
tm.assert_frame_equal(DataFrame.from_records(arr2), DataFrame(arr2))

# wrong length
msg = r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)"
msg = (
r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)|"
"Passed arrays should have the same length as the rows Index: 2 vs 1 rows"
)
with pytest.raises(ValueError, match=msg):
DataFrame.from_records(arr, index=index[:-1])

Expand Down Expand Up @@ -386,6 +396,7 @@ def create_dict(order_id):
result = DataFrame.from_records(documents, index=["order_id", "quantity"])
assert result.index.names == ("order_id", "quantity")

@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) empty from_records
def test_from_records_misc_brokenness(self):
# GH#2179

Expand Down Expand Up @@ -424,6 +435,7 @@ def test_from_records_misc_brokenness(self):
)
tm.assert_series_equal(result, expected)

@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) empty from_records
def test_from_records_empty(self):
# GH#3562
result = DataFrame.from_records([], columns=["a", "b", "c"])
Expand Down
Loading