From 0ee16112787739ec9bcddedc1c49bf23be0bcb50 Mon Sep 17 00:00:00 2001 From: Puneetha Pai Date: Sat, 16 May 2020 00:10:33 +0530 Subject: [PATCH 01/10] Add type check for suffixes field in pd.merge to restrict types to tuple and list --- pandas/core/reshape/merge.py | 19 ++++++++++++------- pandas/tests/reshape/merge/test_merge.py | 22 +++++++++++++++++----- 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 0c796c8f45a52..c6f44660146a8 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -667,10 +667,8 @@ def get_result(self): join_index, left_indexer, right_indexer = self._get_join_info() - lsuf, rsuf = self.suffixes - llabels, rlabels = _items_overlap_with_suffix( - self.left._info_axis, lsuf, self.right._info_axis, rsuf + self.left._info_axis, self.right._info_axis, self.suffixes ) lindexers = {1: left_indexer} if left_indexer is not None else {} @@ -1484,10 +1482,8 @@ def __init__( def get_result(self): join_index, left_indexer, right_indexer = self._get_join_info() - lsuf, rsuf = self.suffixes - llabels, rlabels = _items_overlap_with_suffix( - self.left._info_axis, lsuf, self.right._info_axis, rsuf + self.left._info_axis, self.right._info_axis, self.suffixes ) if self.fill_method == "ffill": @@ -2067,17 +2063,26 @@ def _validate_operand(obj: FrameOrSeries) -> "DataFrame": ) -def _items_overlap_with_suffix(left: Index, lsuffix, right: Index, rsuffix): +def _items_overlap_with_suffix(left: Index, right: Index, suffixes): """ + Suffixes type validatoin. + If two indices overlap, add suffixes to overlapping entries. If corresponding suffix is empty, the entry is simply converted to string. """ + if not isinstance(suffixes, (tuple, list)): + raise TypeError( + f"suffixes should be of type list/tuple. But got {type(suffixes)}" + ) + to_rename = left.intersection(right) if len(to_rename) == 0: return left, right + lsuffix, rsuffix = suffixes + if not lsuffix and not rsuffix: raise ValueError(f"columns overlap but no suffix specified: {to_rename}") diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 4408aa0bbce4a..b043c2e57e6e2 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2075,18 +2075,30 @@ def test_merge_suffix_error(col1, col2, suffixes): pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes) -@pytest.mark.parametrize("col1, col2, suffixes", [("a", "a", None), (0, 0, None)]) -def test_merge_suffix_none_error(col1, col2, suffixes): - # issue: 24782 +@pytest.mark.parametrize( + "col1, col2, suffixes", [("a", "a", {"a", "b"}), ("a", "a", None), (0, 0, None)], +) +def test_merge_suffix_type_error(col1, col2, suffixes): a = pd.DataFrame({col1: [1, 2, 3]}) b = pd.DataFrame({col2: [3, 4, 5]}) - # TODO: might reconsider current raise behaviour, see GH24782 - msg = "iterable" + msg = f"suffixes should be of type list/tuple. But got {type(suffixes)}" with pytest.raises(TypeError, match=msg): pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes) +@pytest.mark.parametrize( + "col1, col2, suffixes", [("a", "a", ("a", "b", "c"))], +) +def test_merge_suffix_length_error(col1, col2, suffixes): + a = pd.DataFrame({col1: [1, 2, 3]}) + b = pd.DataFrame({col2: [3, 4, 5]}) + + msg = r"too many values to unpack \(expected 2\)" + with pytest.raises(ValueError, match=msg): + pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes) + + @pytest.mark.parametrize("cat_dtype", ["one", "two"]) @pytest.mark.parametrize("reverse", [True, False]) def test_merge_equal_cat_dtypes(cat_dtype, reverse): From 07d111094993f0ba7d8d8f216c1912e150fe3b5f Mon Sep 17 00:00:00 2001 From: Puneetha Pai Date: Sat, 16 May 2020 18:52:24 +0530 Subject: [PATCH 02/10] Add test to invalidate lenght one suffix parameter --- pandas/tests/reshape/merge/test_merge.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index b043c2e57e6e2..6d7611c134ac3 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2088,13 +2088,16 @@ def test_merge_suffix_type_error(col1, col2, suffixes): @pytest.mark.parametrize( - "col1, col2, suffixes", [("a", "a", ("a", "b", "c"))], + "col1, col2, suffixes, msg", + [ + ("a", "a", ("a", "b", "c"), r"too many values to unpack \(expected 2\)"), + ("a", "a", ("a"), r"not enough values to unpack \(expected 2, got 1\)"), + ], ) -def test_merge_suffix_length_error(col1, col2, suffixes): +def test_merge_suffix_length_error(col1, col2, suffixes, msg): a = pd.DataFrame({col1: [1, 2, 3]}) b = pd.DataFrame({col2: [3, 4, 5]}) - msg = r"too many values to unpack \(expected 2\)" with pytest.raises(ValueError, match=msg): pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes) From 21eaf2f2c825ff81cdda10e309c6edf102f6d103 Mon Sep 17 00:00:00 2001 From: Puneetha Pai Date: Sat, 16 May 2020 18:54:46 +0530 Subject: [PATCH 03/10] Update suffixes pamaeter doc in merge method --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5f8ab8966c1f0..07651460dab17 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -226,7 +226,7 @@ sort : bool, default False Sort the join keys lexicographically in the result DataFrame. If False, the order of the join keys depends on the join type (how keyword). -suffixes : tuple of (str, str), default ('_x', '_y') +suffixes : list/tuple of (str, str), default ('_x', '_y') Suffix to apply to overlapping column names in the left and right side, respectively. To raise an exception on overlapping columns use (False, False). From 283335baeac581d8b934e0ab6b09b8ab32f12e82 Mon Sep 17 00:00:00 2001 From: Puneetha Pai Date: Sat, 16 May 2020 19:55:20 +0530 Subject: [PATCH 04/10] Fix test by changing to list of single item --- pandas/tests/reshape/merge/test_merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 6d7611c134ac3..929a73a84a491 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2091,7 +2091,7 @@ def test_merge_suffix_type_error(col1, col2, suffixes): "col1, col2, suffixes, msg", [ ("a", "a", ("a", "b", "c"), r"too many values to unpack \(expected 2\)"), - ("a", "a", ("a"), r"not enough values to unpack \(expected 2, got 1\)"), + ("a", "a", ["a"], r"not enough values to unpack \(expected 2, got 1\)"), ], ) def test_merge_suffix_length_error(col1, col2, suffixes, msg): From a3a97c1219d34fdf71d9964ff908bdfb64074666 Mon Sep 17 00:00:00 2001 From: Puneetha Pai Date: Tue, 19 May 2020 20:59:55 +0530 Subject: [PATCH 05/10] Change suffixes field to accept only tuple of len 2 --- pandas/core/frame.py | 2 +- pandas/core/reshape/merge.py | 6 +++--- pandas/tests/reshape/merge/test_merge.py | 10 ++++------ 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 07651460dab17..5f8ab8966c1f0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -226,7 +226,7 @@ sort : bool, default False Sort the join keys lexicographically in the result DataFrame. If False, the order of the join keys depends on the join type (how keyword). -suffixes : list/tuple of (str, str), default ('_x', '_y') +suffixes : tuple of (str, str), default ('_x', '_y') Suffix to apply to overlapping column names in the left and right side, respectively. To raise an exception on overlapping columns use (False, False). diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index c6f44660146a8..857fb43ff5fa3 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2065,16 +2065,16 @@ def _validate_operand(obj: FrameOrSeries) -> "DataFrame": def _items_overlap_with_suffix(left: Index, right: Index, suffixes): """ - Suffixes type validatoin. + Suffixes type validation. If two indices overlap, add suffixes to overlapping entries. If corresponding suffix is empty, the entry is simply converted to string. """ - if not isinstance(suffixes, (tuple, list)): + if not isinstance(suffixes, tuple): raise TypeError( - f"suffixes should be of type list/tuple. But got {type(suffixes)}" + f"suffixes should be tuple of (str, str). But got {type(suffixes)}" ) to_rename = left.intersection(right) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 929a73a84a491..b03eedf2bb639 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2004,8 +2004,8 @@ def test_merge_series(on, left_on, right_on, left_index, right_index, nm): ("b", "b", dict(suffixes=(None, "_y")), ["b", "b_y"]), ("a", "a", dict(suffixes=("_x", None)), ["a_x", "a"]), ("a", "b", dict(suffixes=("_x", None)), ["a", "b"]), - ("a", "a", dict(suffixes=[None, "_x"]), ["a", "a_x"]), - (0, 0, dict(suffixes=["_a", None]), ["0_a", 0]), + ("a", "a", dict(suffixes=(None, "_x")), ["a", "a_x"]), + (0, 0, dict(suffixes=("_a", None)), ["0_a", 0]), ("a", "a", dict(), ["a_x", "a_y"]), (0, 0, dict(), ["0_x", "0_y"]), ], @@ -2057,10 +2057,8 @@ def test_merge_duplicate_suffix(how, expected): @pytest.mark.parametrize( "col1, col2, suffixes", [ - ("a", "a", [None, None]), ("a", "a", (None, None)), ("a", "a", ("", None)), - (0, 0, [None, None]), (0, 0, (None, "")), ], ) @@ -2082,7 +2080,7 @@ def test_merge_suffix_type_error(col1, col2, suffixes): a = pd.DataFrame({col1: [1, 2, 3]}) b = pd.DataFrame({col2: [3, 4, 5]}) - msg = f"suffixes should be of type list/tuple. But got {type(suffixes)}" + msg = f"suffixes should be tuple of \\(str, str\\). But got {type(suffixes)}" with pytest.raises(TypeError, match=msg): pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes) @@ -2091,7 +2089,7 @@ def test_merge_suffix_type_error(col1, col2, suffixes): "col1, col2, suffixes, msg", [ ("a", "a", ("a", "b", "c"), r"too many values to unpack \(expected 2\)"), - ("a", "a", ["a"], r"not enough values to unpack \(expected 2, got 1\)"), + ("a", "a", tuple("a"), r"not enough values to unpack \(expected 2, got 1\)"), ], ) def test_merge_suffix_length_error(col1, col2, suffixes, msg): From 44208535d919add42f49d074bbdbc43765137367 Mon Sep 17 00:00:00 2001 From: Puneetha Pai Date: Tue, 19 May 2020 22:28:13 +0530 Subject: [PATCH 06/10] Change suffixes field from list to tuple in test_join file --- pandas/tests/reshape/merge/test_join.py | 4 ++-- pandas/tests/reshape/merge/test_merge.py | 6 +----- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index dc1efa46403be..c33443e24b268 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -162,7 +162,7 @@ def test_inner_join(self): _check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="inner") def test_handle_overlap(self): - joined = merge(self.df, self.df2, on="key2", suffixes=[".foo", ".bar"]) + joined = merge(self.df, self.df2, on="key2", suffixes=(".foo", ".bar")) assert "key1.foo" in joined assert "key1.bar" in joined @@ -173,7 +173,7 @@ def test_handle_overlap_arbitrary_key(self): self.df2, left_on="key2", right_on="key1", - suffixes=[".foo", ".bar"], + suffixes=(".foo", ".bar"), ) assert "key1.foo" in joined assert "key2.bar" in joined diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index b03eedf2bb639..76e63d345ac35 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2056,11 +2056,7 @@ def test_merge_duplicate_suffix(how, expected): @pytest.mark.parametrize( "col1, col2, suffixes", - [ - ("a", "a", (None, None)), - ("a", "a", ("", None)), - (0, 0, (None, "")), - ], + [("a", "a", (None, None)), ("a", "a", ("", None)), (0, 0, (None, "")),], ) def test_merge_suffix_error(col1, col2, suffixes): # issue: 24782 From 05f2e653fb33799101831a3212d37970f4d3503a Mon Sep 17 00:00:00 2001 From: Puneetha Pai Date: Tue, 19 May 2020 22:31:00 +0530 Subject: [PATCH 07/10] Add typing Tuple format to suffixes field Co-authored-by: Simon Hawkins --- pandas/core/reshape/merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 857fb43ff5fa3..200a33d8959f5 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2063,7 +2063,7 @@ def _validate_operand(obj: FrameOrSeries) -> "DataFrame": ) -def _items_overlap_with_suffix(left: Index, right: Index, suffixes): +def _items_overlap_with_suffix(left: Index, right: Index, suffixes: Tuple[str, str]): """ Suffixes type validation. From 1a9c96051c66a54c77844e5d378587184ee9b228 Mon Sep 17 00:00:00 2001 From: Puneetha Pai Date: Fri, 22 May 2020 12:16:07 +0530 Subject: [PATCH 08/10] Fix pep8 error and incorporate error message suggestion --- pandas/core/reshape/merge.py | 2 +- pandas/tests/reshape/merge/test_merge.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 200a33d8959f5..5e4eb89f0b45f 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2074,7 +2074,7 @@ def _items_overlap_with_suffix(left: Index, right: Index, suffixes: Tuple[str, s """ if not isinstance(suffixes, tuple): raise TypeError( - f"suffixes should be tuple of (str, str). But got {type(suffixes)}" + f"suffixes should be tuple of (str, str). But got {type(suffixes).__name__}" ) to_rename = left.intersection(right) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 76e63d345ac35..0a4d5f17a48cc 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2056,7 +2056,7 @@ def test_merge_duplicate_suffix(how, expected): @pytest.mark.parametrize( "col1, col2, suffixes", - [("a", "a", (None, None)), ("a", "a", ("", None)), (0, 0, (None, "")),], + [("a", "a", (None, None)), ("a", "a", ("", None)), (0, 0, (None, ""))], ) def test_merge_suffix_error(col1, col2, suffixes): # issue: 24782 @@ -2076,7 +2076,9 @@ def test_merge_suffix_type_error(col1, col2, suffixes): a = pd.DataFrame({col1: [1, 2, 3]}) b = pd.DataFrame({col2: [3, 4, 5]}) - msg = f"suffixes should be tuple of \\(str, str\\). But got {type(suffixes)}" + msg = ( + f"suffixes should be tuple of \\(str, str\\). But got {type(suffixes).__name__}" + ) with pytest.raises(TypeError, match=msg): pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes) From ec906623cc9f31c2fb59c994eb96cff948e31c98 Mon Sep 17 00:00:00 2001 From: Puneetha Pai Date: Fri, 22 May 2020 19:48:12 +0530 Subject: [PATCH 09/10] Fix suffix type in documentation also --- doc/source/user_guide/merging.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst index 56ff8c1fc7c9b..0639e4a7bb5e4 100644 --- a/doc/source/user_guide/merging.rst +++ b/doc/source/user_guide/merging.rst @@ -1273,7 +1273,7 @@ columns: .. ipython:: python - result = pd.merge(left, right, on='k', suffixes=['_l', '_r']) + result = pd.merge(left, right, on='k', suffixes=('_l', '_r')) .. ipython:: python :suppress: From b06005924cb26ba001fc491489284ff9ea8c8f7a Mon Sep 17 00:00:00 2001 From: Puneetha Pai Date: Sat, 23 May 2020 10:15:33 +0530 Subject: [PATCH 10/10] Update backward incompatible section in v1.1.0 relase --- doc/source/whatsnew/v1.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 2243790a663df..197ffdc2ccef0 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -401,6 +401,7 @@ Backwards incompatible API changes - :func: `pandas.api.dtypes.is_string_dtype` no longer incorrectly identifies categorical series as string. - :func:`read_excel` no longer takes ``**kwds`` arguments. This means that passing in keyword ``chunksize`` now raises a ``TypeError`` (previously raised a ``NotImplementedError``), while passing in keyword ``encoding`` now raises a ``TypeError`` (:issue:`34464`) +- :func: `merge` now checks ``suffixes`` parameter type to be ``tuple`` and raises ``TypeError``, whereas before a ``list`` or ``set`` were accepted and that the ``set`` could produce unexpected results (:issue:`33740`) ``MultiIndex.get_indexer`` interprets `method` argument differently ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^