Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Restrict Pandas merge suffixes param type to list/tuple to avoid interchange in right and left suffix order #34208

Merged
merged 10 commits into from
Jun 8, 2020
2 changes: 1 addition & 1 deletion doc/source/user_guide/merging.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1273,7 +1273,7 @@ columns:

.. ipython:: python

result = pd.merge(left, right, on='k', suffixes=['_l', '_r'])
result = pd.merge(left, right, on='k', suffixes=('_l', '_r'))

.. ipython:: python
:suppress:
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,7 @@ Backwards incompatible API changes
- :func: `pandas.api.dtypes.is_string_dtype` no longer incorrectly identifies categorical series as string.
- :func:`read_excel` no longer takes ``**kwds`` arguments. This means that passing in keyword ``chunksize`` now raises a ``TypeError``
(previously raised a ``NotImplementedError``), while passing in keyword ``encoding`` now raises a ``TypeError`` (:issue:`34464`)
- :func: `merge` now checks ``suffixes`` parameter type to be ``tuple`` and raises ``TypeError``, whereas before a ``list`` or ``set`` were accepted and that the ``set`` could produce unexpected results (:issue:`33740`)

``MultiIndex.get_indexer`` interprets `method` argument differently
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Expand Down
19 changes: 12 additions & 7 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -667,10 +667,8 @@ def get_result(self):

join_index, left_indexer, right_indexer = self._get_join_info()

lsuf, rsuf = self.suffixes

llabels, rlabels = _items_overlap_with_suffix(
self.left._info_axis, lsuf, self.right._info_axis, rsuf
self.left._info_axis, self.right._info_axis, self.suffixes
)

lindexers = {1: left_indexer} if left_indexer is not None else {}
Expand Down Expand Up @@ -1484,10 +1482,8 @@ def __init__(
def get_result(self):
join_index, left_indexer, right_indexer = self._get_join_info()

lsuf, rsuf = self.suffixes

llabels, rlabels = _items_overlap_with_suffix(
self.left._info_axis, lsuf, self.right._info_axis, rsuf
self.left._info_axis, self.right._info_axis, self.suffixes
)

if self.fill_method == "ffill":
Expand Down Expand Up @@ -2067,17 +2063,26 @@ def _validate_operand(obj: FrameOrSeries) -> "DataFrame":
)


def _items_overlap_with_suffix(left: Index, lsuffix, right: Index, rsuffix):
def _items_overlap_with_suffix(left: Index, right: Index, suffixes: Tuple[str, str]):
"""
Suffixes type validation.

If two indices overlap, add suffixes to overlapping entries.

If corresponding suffix is empty, the entry is simply converted to string.

"""
if not isinstance(suffixes, tuple):
raise TypeError(
f"suffixes should be tuple of (str, str). But got {type(suffixes).__name__}"
)

to_rename = left.intersection(right)
if len(to_rename) == 0:
return left, right

lsuffix, rsuffix = suffixes

if not lsuffix and not rsuffix:
raise ValueError(f"columns overlap but no suffix specified: {to_rename}")

Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/reshape/merge/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ def test_inner_join(self):
_check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="inner")

def test_handle_overlap(self):
joined = merge(self.df, self.df2, on="key2", suffixes=[".foo", ".bar"])
joined = merge(self.df, self.df2, on="key2", suffixes=(".foo", ".bar"))

assert "key1.foo" in joined
assert "key1.bar" in joined
Expand All @@ -173,7 +173,7 @@ def test_handle_overlap_arbitrary_key(self):
self.df2,
left_on="key2",
right_on="key1",
suffixes=[".foo", ".bar"],
suffixes=(".foo", ".bar"),
)
assert "key1.foo" in joined
assert "key2.bar" in joined
Expand Down
39 changes: 25 additions & 14 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -2004,8 +2004,8 @@ def test_merge_series(on, left_on, right_on, left_index, right_index, nm):
("b", "b", dict(suffixes=(None, "_y")), ["b", "b_y"]),
("a", "a", dict(suffixes=("_x", None)), ["a_x", "a"]),
("a", "b", dict(suffixes=("_x", None)), ["a", "b"]),
("a", "a", dict(suffixes=[None, "_x"]), ["a", "a_x"]),
(0, 0, dict(suffixes=["_a", None]), ["0_a", 0]),
("a", "a", dict(suffixes=(None, "_x")), ["a", "a_x"]),
(0, 0, dict(suffixes=("_a", None)), ["0_a", 0]),
("a", "a", dict(), ["a_x", "a_y"]),
(0, 0, dict(), ["0_x", "0_y"]),
],
Expand Down Expand Up @@ -2056,13 +2056,7 @@ def test_merge_duplicate_suffix(how, expected):

@pytest.mark.parametrize(
"col1, col2, suffixes",
[
("a", "a", [None, None]),
("a", "a", (None, None)),
("a", "a", ("", None)),
(0, 0, [None, None]),
(0, 0, (None, "")),
],
[("a", "a", (None, None)), ("a", "a", ("", None)), (0, 0, (None, ""))],
)
def test_merge_suffix_error(col1, col2, suffixes):
# issue: 24782
Expand All @@ -2075,18 +2069,35 @@ def test_merge_suffix_error(col1, col2, suffixes):
pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes)


@pytest.mark.parametrize("col1, col2, suffixes", [("a", "a", None), (0, 0, None)])
def test_merge_suffix_none_error(col1, col2, suffixes):
# issue: 24782
@pytest.mark.parametrize(
"col1, col2, suffixes", [("a", "a", {"a", "b"}), ("a", "a", None), (0, 0, None)],
)
def test_merge_suffix_type_error(col1, col2, suffixes):
a = pd.DataFrame({col1: [1, 2, 3]})
b = pd.DataFrame({col2: [3, 4, 5]})

# TODO: might reconsider current raise behaviour, see GH24782
msg = "iterable"
msg = (
f"suffixes should be tuple of \\(str, str\\). But got {type(suffixes).__name__}"
)
with pytest.raises(TypeError, match=msg):
pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes)


@pytest.mark.parametrize(
"col1, col2, suffixes, msg",
[
("a", "a", ("a", "b", "c"), r"too many values to unpack \(expected 2\)"),
("a", "a", tuple("a"), r"not enough values to unpack \(expected 2, got 1\)"),
],
)
def test_merge_suffix_length_error(col1, col2, suffixes, msg):
a = pd.DataFrame({col1: [1, 2, 3]})
b = pd.DataFrame({col2: [3, 4, 5]})

with pytest.raises(ValueError, match=msg):
pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes)


@pytest.mark.parametrize("cat_dtype", ["one", "two"])
@pytest.mark.parametrize("reverse", [True, False])
def test_merge_equal_cat_dtypes(cat_dtype, reverse):
Expand Down