diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst index 56ff8c1fc7c9b..0639e4a7bb5e4 100644 --- a/doc/source/user_guide/merging.rst +++ b/doc/source/user_guide/merging.rst @@ -1273,7 +1273,7 @@ columns: .. ipython:: python - result = pd.merge(left, right, on='k', suffixes=['_l', '_r']) + result = pd.merge(left, right, on='k', suffixes=('_l', '_r')) .. ipython:: python :suppress: diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 2243790a663df..197ffdc2ccef0 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -401,6 +401,7 @@ Backwards incompatible API changes - :func: `pandas.api.dtypes.is_string_dtype` no longer incorrectly identifies categorical series as string. - :func:`read_excel` no longer takes ``**kwds`` arguments. This means that passing in keyword ``chunksize`` now raises a ``TypeError`` (previously raised a ``NotImplementedError``), while passing in keyword ``encoding`` now raises a ``TypeError`` (:issue:`34464`) +- :func: `merge` now checks ``suffixes`` parameter type to be ``tuple`` and raises ``TypeError``, whereas before a ``list`` or ``set`` were accepted and that the ``set`` could produce unexpected results (:issue:`33740`) ``MultiIndex.get_indexer`` interprets `method` argument differently ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 0c796c8f45a52..5e4eb89f0b45f 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -667,10 +667,8 @@ def get_result(self): join_index, left_indexer, right_indexer = self._get_join_info() - lsuf, rsuf = self.suffixes - llabels, rlabels = _items_overlap_with_suffix( - self.left._info_axis, lsuf, self.right._info_axis, rsuf + self.left._info_axis, self.right._info_axis, self.suffixes ) lindexers = {1: left_indexer} if left_indexer is not None else {} @@ -1484,10 +1482,8 @@ def __init__( def get_result(self): join_index, left_indexer, right_indexer = self._get_join_info() - lsuf, rsuf = self.suffixes - llabels, rlabels = _items_overlap_with_suffix( - self.left._info_axis, lsuf, self.right._info_axis, rsuf + self.left._info_axis, self.right._info_axis, self.suffixes ) if self.fill_method == "ffill": @@ -2067,17 +2063,26 @@ def _validate_operand(obj: FrameOrSeries) -> "DataFrame": ) -def _items_overlap_with_suffix(left: Index, lsuffix, right: Index, rsuffix): +def _items_overlap_with_suffix(left: Index, right: Index, suffixes: Tuple[str, str]): """ + Suffixes type validation. + If two indices overlap, add suffixes to overlapping entries. If corresponding suffix is empty, the entry is simply converted to string. """ + if not isinstance(suffixes, tuple): + raise TypeError( + f"suffixes should be tuple of (str, str). But got {type(suffixes).__name__}" + ) + to_rename = left.intersection(right) if len(to_rename) == 0: return left, right + lsuffix, rsuffix = suffixes + if not lsuffix and not rsuffix: raise ValueError(f"columns overlap but no suffix specified: {to_rename}") diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index dc1efa46403be..c33443e24b268 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -162,7 +162,7 @@ def test_inner_join(self): _check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="inner") def test_handle_overlap(self): - joined = merge(self.df, self.df2, on="key2", suffixes=[".foo", ".bar"]) + joined = merge(self.df, self.df2, on="key2", suffixes=(".foo", ".bar")) assert "key1.foo" in joined assert "key1.bar" in joined @@ -173,7 +173,7 @@ def test_handle_overlap_arbitrary_key(self): self.df2, left_on="key2", right_on="key1", - suffixes=[".foo", ".bar"], + suffixes=(".foo", ".bar"), ) assert "key1.foo" in joined assert "key2.bar" in joined diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 4408aa0bbce4a..0a4d5f17a48cc 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2004,8 +2004,8 @@ def test_merge_series(on, left_on, right_on, left_index, right_index, nm): ("b", "b", dict(suffixes=(None, "_y")), ["b", "b_y"]), ("a", "a", dict(suffixes=("_x", None)), ["a_x", "a"]), ("a", "b", dict(suffixes=("_x", None)), ["a", "b"]), - ("a", "a", dict(suffixes=[None, "_x"]), ["a", "a_x"]), - (0, 0, dict(suffixes=["_a", None]), ["0_a", 0]), + ("a", "a", dict(suffixes=(None, "_x")), ["a", "a_x"]), + (0, 0, dict(suffixes=("_a", None)), ["0_a", 0]), ("a", "a", dict(), ["a_x", "a_y"]), (0, 0, dict(), ["0_x", "0_y"]), ], @@ -2056,13 +2056,7 @@ def test_merge_duplicate_suffix(how, expected): @pytest.mark.parametrize( "col1, col2, suffixes", - [ - ("a", "a", [None, None]), - ("a", "a", (None, None)), - ("a", "a", ("", None)), - (0, 0, [None, None]), - (0, 0, (None, "")), - ], + [("a", "a", (None, None)), ("a", "a", ("", None)), (0, 0, (None, ""))], ) def test_merge_suffix_error(col1, col2, suffixes): # issue: 24782 @@ -2075,18 +2069,35 @@ def test_merge_suffix_error(col1, col2, suffixes): pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes) -@pytest.mark.parametrize("col1, col2, suffixes", [("a", "a", None), (0, 0, None)]) -def test_merge_suffix_none_error(col1, col2, suffixes): - # issue: 24782 +@pytest.mark.parametrize( + "col1, col2, suffixes", [("a", "a", {"a", "b"}), ("a", "a", None), (0, 0, None)], +) +def test_merge_suffix_type_error(col1, col2, suffixes): a = pd.DataFrame({col1: [1, 2, 3]}) b = pd.DataFrame({col2: [3, 4, 5]}) - # TODO: might reconsider current raise behaviour, see GH24782 - msg = "iterable" + msg = ( + f"suffixes should be tuple of \\(str, str\\). But got {type(suffixes).__name__}" + ) with pytest.raises(TypeError, match=msg): pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes) +@pytest.mark.parametrize( + "col1, col2, suffixes, msg", + [ + ("a", "a", ("a", "b", "c"), r"too many values to unpack \(expected 2\)"), + ("a", "a", tuple("a"), r"not enough values to unpack \(expected 2, got 1\)"), + ], +) +def test_merge_suffix_length_error(col1, col2, suffixes, msg): + a = pd.DataFrame({col1: [1, 2, 3]}) + b = pd.DataFrame({col2: [3, 4, 5]}) + + with pytest.raises(ValueError, match=msg): + pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes) + + @pytest.mark.parametrize("cat_dtype", ["one", "two"]) @pytest.mark.parametrize("reverse", [True, False]) def test_merge_equal_cat_dtypes(cat_dtype, reverse):