From a27b04b476589743bcf959db2b07d17622ee91e8 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 15 Jun 2020 16:00:32 -0500 Subject: [PATCH 1/5] API: Allow non-tuples in pandas.merge Closes https://github.com/pandas-dev/pandas/issues/34741, while retaining the spirit of https://github.com/pandas-dev/pandas/pull/34208. --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/reshape/merge.py | 10 +++++++--- pandas/tests/reshape/merge/test_merge.py | 16 +++++----------- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 0c746b197c5b8..eb9019d7fba90 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -746,6 +746,7 @@ Deprecations - :meth:`DataFrame.to_dict` has deprecated accepting short names for ``orient`` in future versions (:issue:`32515`) - :meth:`Categorical.to_dense` is deprecated and will be removed in a future version, use ``np.asarray(cat)`` instead (:issue:`32639`) - The ``fastpath`` keyword in the ``SingleBlockManager`` constructor is deprecated and will be removed in a future version (:issue:`33092`) +- Providing ``suffixes`` as a ``set`` in :func:`pandas.merge` is deprecated. Provide a tuple instead (:issue:`33740`, `34741`). - :meth:`Index.is_mixed` is deprecated and will be removed in a future version, check ``index.inferred_type`` directly instead (:issue:`32922`) - Passing any arguments but the first one to :func:`read_html` as diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 5e4eb89f0b45f..e23221751dc86 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2072,9 +2072,13 @@ def _items_overlap_with_suffix(left: Index, right: Index, suffixes: Tuple[str, s If corresponding suffix is empty, the entry is simply converted to string. """ - if not isinstance(suffixes, tuple): - raise TypeError( - f"suffixes should be tuple of (str, str). 
But got {type(suffixes).__name__}" + if isinstance(suffixes, set): + warnings.warn( + "Passing 'suffixes' as a set, which is unordered, may result in " + "unexpected results. Provide 'suffixes' as a tuple instead. In the " + "future a 'TypeError' will be raised.", + FutureWarning, + stacklevel=4, ) to_rename = left.intersection(right) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 0a4d5f17a48cc..4e25095b7c770 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2069,18 +2069,12 @@ def test_merge_suffix_error(col1, col2, suffixes): pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes) -@pytest.mark.parametrize( - "col1, col2, suffixes", [("a", "a", {"a", "b"}), ("a", "a", None), (0, 0, None)], -) -def test_merge_suffix_type_error(col1, col2, suffixes): - a = pd.DataFrame({col1: [1, 2, 3]}) - b = pd.DataFrame({col2: [3, 4, 5]}) +def test_merge_suffix_set(): + a = pd.DataFrame({"a": [1, 2, 3]}) + b = pd.DataFrame({"b": [3, 4, 5]}) - msg = ( - f"suffixes should be tuple of \\(str, str\\). 
But got {type(suffixes).__name__}" - ) - with pytest.raises(TypeError, match=msg): - pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes) + with tm.assert_produces_warning(FutureWarning): + pd.merge(a, b, left_index=True, right_index=True, suffixes={"left", "right"}) @pytest.mark.parametrize( From a50771425150f495a87c74f18ae4f855f5d98b4d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 15 Jun 2020 16:04:06 -0500 Subject: [PATCH 2/5] update whatsnew --- doc/source/whatsnew/v1.1.0.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index eb9019d7fba90..fefe3f92b4b97 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -404,7 +404,6 @@ Backwards incompatible API changes - :func: `pandas.api.dtypes.is_string_dtype` no longer incorrectly identifies categorical series as string. - :func:`read_excel` no longer takes ``**kwds`` arguments. This means that passing in keyword ``chunksize`` now raises a ``TypeError`` (previously raised a ``NotImplementedError``), while passing in keyword ``encoding`` now raises a ``TypeError`` (:issue:`34464`) -- :func: `merge` now checks ``suffixes`` parameter type to be ``tuple`` and raises ``TypeError``, whereas before a ``list`` or ``set`` were accepted and that the ``set`` could produce unexpected results (:issue:`33740`) - :class:`Period` no longer accepts tuples for the ``freq`` argument (:issue:`34658`) - :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` now raises ValueError if ``limit_direction`` is 'forward' or 'both' and ``method`` is 'backfill' or 'bfill' or ``limit_direction`` is 'backward' or 'both' and ``method`` is 'pad' or 'ffill' (:issue:`34746`) @@ -746,7 +745,7 @@ Deprecations - :meth:`DataFrame.to_dict` has deprecated accepting short names for ``orient`` in future versions (:issue:`32515`) - :meth:`Categorical.to_dense` is deprecated and will be removed in a future version, 
use ``np.asarray(cat)`` instead (:issue:`32639`) - The ``fastpath`` keyword in the ``SingleBlockManager`` constructor is deprecated and will be removed in a future version (:issue:`33092`) -- Providing ``suffixes`` as a ``set`` in :func:`pandas.merge` is deprecated. Provide a tuple instead (:issue:`33740`, `34741`). +- Providing ``suffixes`` as a ``set`` in :func:`pandas.merge` is deprecated. Provide a tuple instead (:issue:`33740`, :issue:`34741`). - :meth:`Index.is_mixed` is deprecated and will be removed in a future version, check ``index.inferred_type`` directly instead (:issue:`32922`) - Passing any arguments but the first one to :func:`read_html` as From 10b94caf90a6edac817d941ffdcc0c8614529949 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 16 Jun 2020 13:15:11 -0500 Subject: [PATCH 3/5] wip --- doc/source/whatsnew/v1.1.0.rst | 1 - pandas/core/frame.py | 11 +++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 5a58ae2e7f246..4b0d53c38a94a 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -658,7 +658,6 @@ Other API changes - :func: `pandas.api.dtypes.is_string_dtype` no longer incorrectly identifies categorical series as string. - :func:`read_excel` no longer takes ``**kwds`` arguments. 
This means that passing in keyword ``chunksize`` now raises a ``TypeError`` (previously raised a ``NotImplementedError``), while passing in keyword ``encoding`` now raises a ``TypeError`` (:issue:`34464`) -- :func: `merge` now checks ``suffixes`` parameter type to be ``tuple`` and raises ``TypeError``, whereas before a ``list`` or ``set`` were accepted and that the ``set`` could produce unexpected results (:issue:`33740`) - :class:`Period` no longer accepts tuples for the ``freq`` argument (:issue:`34658`) - :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` now raises ValueError if ``limit_direction`` is 'forward' or 'both' and ``method`` is 'backfill' or 'bfill' or ``limit_direction`` is 'backward' or 'both' and ``method`` is 'pad' or 'ffill' (:issue:`34746`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 68c06715e1ea4..af4e85bd29849 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -226,10 +226,13 @@ sort : bool, default False Sort the join keys lexicographically in the result DataFrame. If False, the order of the join keys depends on the join type (how keyword). -suffixes : tuple of (str, str), default ('_x', '_y') - Suffix to apply to overlapping column names in the left and right - side, respectively. To raise an exception on overlapping columns use - (False, False). +suffixes : Sequence, default is ("_x", "_y") + A length-2 sequence where each element is optionally a string + indicating the suffix to add to overlapping column names in + `left` and `right` respectively. Pass a value of `None` instead + of a string to indicate that the column name from `left` or + `right` should be left as-is, with no suffix. At least one of the + values must not be None. copy : bool, default True If False, avoid copy if possible. 
indicator : bool or str, default False From baf2ee45c5b91ae77a54577d1ba6e510a0dcb984 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 16 Jun 2020 14:46:15 -0500 Subject: [PATCH 4/5] add suffix --- pandas/tests/reshape/merge/test_merge.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 4e25095b7c770..422244b698d58 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1999,6 +1999,7 @@ def test_merge_series(on, left_on, right_on, left_index, right_index, nm): (0, 0, dict(suffixes=("", "_dup")), ["0", "0_dup"]), (0, 0, dict(suffixes=(None, "_dup")), [0, "0_dup"]), (0, 0, dict(suffixes=("_x", "_y")), ["0_x", "0_y"]), + (0, 0, dict(suffixes=["_x", "_y"]), ["0_x", "0_y"]), ("a", 0, dict(suffixes=(None, "_y")), ["a", 0]), (0.0, 0.0, dict(suffixes=("_x", None)), ["0.0_x", 0.0]), ("b", "b", dict(suffixes=(None, "_y")), ["b", "b_y"]), From b96df6fa9be7ec922032e3eedbc503255aef3381 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 30 Jun 2020 13:09:02 -0500 Subject: [PATCH 5/5] update --- pandas/core/frame.py | 2 +- pandas/core/reshape/merge.py | 6 +++--- pandas/tests/reshape/merge/test_merge.py | 3 ++- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6d3a2661f517a..b6993e9ed851a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -227,7 +227,7 @@ sort : bool, default False Sort the join keys lexicographically in the result DataFrame. If False, the order of the join keys depends on the join type (how keyword). -suffixes : Sequence, default is ("_x", "_y") +suffixes : list-like, default is ("_x", "_y") A length-2 sequence where each element is optionally a string indicating the suffix to add to overlapping column names in `left` and `right` respectively. 
Pass a value of `None` instead diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index e23221751dc86..27b331babe692 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -194,7 +194,7 @@ def merge_ordered( left DataFrame. fill_method : {'ffill', None}, default None Interpolation method for data. - suffixes : Sequence, default is ("_x", "_y") + suffixes : list-like, default is ("_x", "_y") A length-2 sequence where each element is optionally a string indicating the suffix to add to overlapping column names in `left` and `right` respectively. Pass a value of `None` instead @@ -2072,9 +2072,9 @@ def _items_overlap_with_suffix(left: Index, right: Index, suffixes: Tuple[str, s If corresponding suffix is empty, the entry is simply converted to string. """ - if isinstance(suffixes, set): + if not is_list_like(suffixes, allow_sets=False): warnings.warn( - "Passing 'suffixes' as a set, which is unordered, may result in " + f"Passing 'suffixes' as a {type(suffixes)}, is not supported and may give " "unexpected results. Provide 'suffixes' as a tuple instead. In the " "future a 'TypeError' will be raised.", FutureWarning, diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 422244b698d58..4fd3c688b8771 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2070,7 +2070,8 @@ def test_merge_suffix_error(col1, col2, suffixes): pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes) -def test_merge_suffix_set(): +@pytest.mark.parametrize("suffixes", [{"left", "right"}, {"left": 0, "right": 0}]) +def test_merge_suffix_warns(suffixes): a = pd.DataFrame({"a": [1, 2, 3]}) b = pd.DataFrame({"b": [3, 4, 5]})