From 5a0b8f1e2825cd1817142042a38d68ddc0130f87 Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Sat, 18 Nov 2017 00:45:56 +0000 Subject: [PATCH 1/8] BUG: Keep float dtype in merge on int and float column --- pandas/core/reshape/merge.py | 9 ++++++++- pandas/tests/reshape/test_merge.py | 25 ++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 412c00dc95ec0..0062427e1f77d 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -906,13 +906,20 @@ def _maybe_coerce_merge_keys(self): continue # if we are numeric, then allow differing - # kinds to proceed, eg. int64 and int8 + # kinds to proceed, eg. int64 and int8, int and float # further if we are object, but we infer to # the same, then proceed if is_numeric_dtype(lk) and is_numeric_dtype(rk): if lk.dtype.kind == rk.dtype.kind: continue + # check whether ints and floats + if is_integer_dtype(rk) and is_float_dtype(lk): + continue + + if is_float_dtype(rk) and is_integer_dtype(lk): + continue + # let's infer and see if we are ok if lib.infer_dtype(lk) == lib.infer_dtype(rk): continue diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py index 172667c9a0fb8..265442af4dfa5 100644 --- a/pandas/tests/reshape/test_merge.py +++ b/pandas/tests/reshape/test_merge.py @@ -13,7 +13,11 @@ from pandas.core.reshape.merge import merge, MergeError from pandas.util.testing import assert_frame_equal, assert_series_equal from pandas.core.dtypes.dtypes import CategoricalDtype -from pandas.core.dtypes.common import is_categorical_dtype, is_object_dtype +from pandas.core.dtypes.common import ( + is_categorical_dtype, + is_object_dtype, + is_float_dtype, +) from pandas import DataFrame, Index, MultiIndex, Series, Categorical import pandas.util.testing as tm from pandas.api.types import CategoricalDtype as CDT @@ -1408,6 +1412,25 @@ def test_join_multi_dtypes(self, d1, d2): expected.sort_values(['k1', 'k2'], kind='mergesort', inplace=True) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize('int_vals, float_vals', [ + ([1, 2, 3], [1.0, 2.0, 3.0]), + ([1, 2, 3], [1.0, 3.0]), + ([1, 2], [1.0, 2.0, 3.0]), + ([1, 2, 3], [1.1, 2.5, 3.0]), + ]) + def test_merge_on_ints_floats(self, int_vals, float_vals): + # GH 16572 + # Check that float column is not cast to object if + # merging on float and int + A = DataFrame({'X': int_vals}) + B = DataFrame({'Y': float_vals}) + + res = A.merge(B, left_on='X', right_on='Y') + assert is_float_dtype(res['Y'].dtype) + + res = B.merge(A, left_on='Y', right_on='X') + assert is_float_dtype(res['Y'].dtype) + @pytest.fixture def left(): From 039eeaef4c107cab9f860accb178c1c1dcd2b2d4 Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Mon, 20 Nov 2017 22:42:22 +0000 Subject: [PATCH 2/8] check if float values equal to int representation and add warning --- pandas/core/reshape/merge.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 0062427e1f77d..3eeeceaeb76ea 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -914,16 +914,29 @@ def _maybe_coerce_merge_keys(self): continue # check whether ints and floats - if is_integer_dtype(rk) and is_float_dtype(lk): + elif is_integer_dtype(rk) and is_float_dtype(lk): + if not (lk == lk.astype(rk.dtype)).all(): + warnings.warn('You are merging on int and float ' + 'columns where the float values ' + 'are not equal to their int ' + 'representation', UserWarning) continue - if is_float_dtype(rk) and is_integer_dtype(lk): + elif is_float_dtype(rk) and is_integer_dtype(lk): + if not (rk == rk.astype(lk.dtype)).all(): + warnings.warn('You are merging on int and float ' + 'columns where the float values ' + 'are not equal to their int ' + 'representation', UserWarning) continue # let's infer and see if we are ok - if lib.infer_dtype(lk) == lib.infer_dtype(rk): + elif lib.infer_dtype(lk) == lib.infer_dtype(rk): continue + else: + pass + # Houston, we have a problem! # let's coerce to object if the dtypes aren't # categorical, otherwise coerce to the category From 7f1db98647d059bda579feb5f3a666c3c976d846 Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Mon, 20 Nov 2017 23:42:47 +0000 Subject: [PATCH 3/8] whatsnew --- doc/source/whatsnew/v0.22.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 4a4d60b4dfbb2..ea099a2fce1d2 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -44,6 +44,7 @@ Other Enhancements - :class:`pandas.io.formats.style.Styler` now has method ``hide_columns()`` to determine whether columns will be hidden in output (:issue:`14194`) - Improved wording of ``ValueError`` raised in :func:`to_datetime` when ``unit=`` is passed with a non-convertible value (:issue:`14350`) - :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`) +- :func:`pandas.DataFrame.merge` no longer casts a ``float`` column to ``object`` when merging on ``int`` and ``float`` columns (:issue:`16572`) .. _whatsnew_0220.api_breaking: From 9c3aea2cce7d33b4288cf919983403be5ffe941e Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Wed, 22 Nov 2017 20:21:50 +0000 Subject: [PATCH 4/8] test for merge results --- pandas/tests/reshape/test_merge.py | 36 +++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py index 265442af4dfa5..7d2578388e09b 100644 --- a/pandas/tests/reshape/test_merge.py +++ b/pandas/tests/reshape/test_merge.py @@ -16,7 +16,6 @@ from pandas.core.dtypes.common import ( is_categorical_dtype, is_object_dtype, - is_float_dtype, ) from pandas import DataFrame, Index, MultiIndex, Series, Categorical import pandas.util.testing as tm @@ -1412,24 +1411,41 @@ def test_join_multi_dtypes(self, d1, d2): expected.sort_values(['k1', 'k2'], kind='mergesort', inplace=True) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('int_vals, float_vals', [ - ([1, 2, 3], [1.0, 2.0, 3.0]), - ([1, 2, 3], [1.0, 3.0]), - ([1, 2], [1.0, 2.0, 3.0]), - ([1, 2, 3], [1.1, 2.5, 3.0]), + @pytest.mark.parametrize('int_vals, float_vals, exp_vals', [ + ([1, 2, 3], [1.0, 2.0, 3.0], {'X': [1, 2, 3], 'Y': [1.0, 2.0, 3.0]}), + ([1, 2, 3], [1.0, 3.0], {'X': [1, 3], 'Y': [1.0, 3.0]}), + ([1, 2], [1.0, 2.0, 3.0], {'X': [1, 2], 'Y': [1.0, 2.0]}), ]) - def test_merge_on_ints_floats(self, int_vals, float_vals): + def test_merge_on_ints_floats(self, int_vals, float_vals, exp_vals): # GH 16572 # Check that float column is not cast to object if - # merging on float and int + # merging on float and int columns A = DataFrame({'X': int_vals}) B = DataFrame({'Y': float_vals}) + exp_res = DataFrame(exp_vals) res = A.merge(B, left_on='X', right_on='Y') - assert is_float_dtype(res['Y'].dtype) + assert_frame_equal(res, exp_res) res = B.merge(A, left_on='Y', right_on='X') - assert is_float_dtype(res['Y'].dtype) + assert_frame_equal(res, exp_res[['Y', 'X']]) + + def test_merge_on_ints_floats_warning(self): + # GH 16572 + # merge will produce a warning when merging on int and + # float columns where the float values are not exactly + # equal to their int representation + A = DataFrame({'X': [1, 2, 3]}) + B = DataFrame({'Y': [1.1, 2.5, 3.0]}) + exp_res = DataFrame({'X': [3], 'Y': [3.0]}) + + with tm.assert_produces_warning(UserWarning): + res = A.merge(B, left_on='X', right_on='Y') + assert_frame_equal(res, exp_res) + + with tm.assert_produces_warning(UserWarning): + res = B.merge(A, left_on='Y', right_on='X') + assert_frame_equal(res, exp_res[['Y', 'X']]) @pytest.fixture From d86c1fb1eefb1d2b484e22cf7ad803a227a563be Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Wed, 22 Nov 2017 20:22:31 +0000 Subject: [PATCH 5/8] add else statement --- pandas/core/reshape/merge.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 3eeeceaeb76ea..f3e38dd7d1ffb 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -911,7 +911,7 @@ def _maybe_coerce_merge_keys(self): # the same, then proceed if is_numeric_dtype(lk) and is_numeric_dtype(rk): if lk.dtype.kind == rk.dtype.kind: - continue + pass # check whether ints and floats elif is_integer_dtype(rk) and is_float_dtype(lk): @@ -920,7 +920,7 @@ def _maybe_coerce_merge_keys(self): 'columns where the float values ' 'are not equal to their int ' 'representation', UserWarning) - continue + pass elif is_float_dtype(rk) and is_integer_dtype(lk): if not (rk == rk.astype(lk.dtype)).all(): @@ -928,13 +928,10 @@ def _maybe_coerce_merge_keys(self): 'columns where the float values ' 'are not equal to their int ' 'representation', UserWarning) - continue + pass # let's infer and see if we are ok elif lib.infer_dtype(lk) == lib.infer_dtype(rk): - continue - - else: pass # Houston, we have a problem! @@ -944,14 +941,15 @@ def _maybe_coerce_merge_keys(self): # then we would lose type information on some # columns, and end up trying to merge # incompatible dtypes. See GH 16900. - if name in self.left.columns: - typ = lk.categories.dtype if lk_is_cat else object - self.left = self.left.assign( - **{name: self.left[name].astype(typ)}) - if name in self.right.columns: - typ = rk.categories.dtype if rk_is_cat else object - self.right = self.right.assign( - **{name: self.right[name].astype(typ)}) + else: + if name in self.left.columns: + typ = lk.categories.dtype if lk_is_cat else object + self.left = self.left.assign( + **{name: self.left[name].astype(typ)}) + if name in self.right.columns: + typ = rk.categories.dtype if rk_is_cat else object + self.right = self.right.assign( + **{name: self.right[name].astype(typ)}) def _validate_specification(self): # Hm, any way to make this logic less complicated?? From fc8ac2ce6b88399c3f99636d68575eb3ffb5ddfe Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Wed, 22 Nov 2017 23:29:20 +0000 Subject: [PATCH 6/8] update whatsnew and tests --- doc/source/whatsnew/v0.22.0.txt | 3 +-- pandas/tests/reshape/test_merge.py | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index ea099a2fce1d2..1a6327554f61a 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -44,7 +44,6 @@ Other Enhancements - :class:`pandas.io.formats.style.Styler` now has method ``hide_columns()`` to determine whether columns will be hidden in output (:issue:`14194`) - Improved wording of ``ValueError`` raised in :func:`to_datetime` when ``unit=`` is passed with a non-convertible value (:issue:`14350`) - :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`) -- :func:`pandas.DataFrame.merge` no longer casts a ``float`` column to ``object`` when merging on ``int`` and ``float`` columns (:issue:`16572`) .. _whatsnew_0220.api_breaking: @@ -52,7 +51,7 @@ Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - :func:`Series.fillna` now raises a ``TypeError`` instead of a ``ValueError`` when passed a list, tuple or DataFrame as a ``value`` (:issue:`18293`) -- +- :func:`pandas.DataFrame.merge` no longer casts a ``float`` column to ``object`` when merging on ``int`` and ``float`` columns (:issue:`16572`) - diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py index 7d2578388e09b..ee7c4e5c90bb8 100644 --- a/pandas/tests/reshape/test_merge.py +++ b/pandas/tests/reshape/test_merge.py @@ -1422,13 +1422,13 @@ def test_merge_on_ints_floats(self, int_vals, float_vals, exp_vals): # merging on float and int columns A = DataFrame({'X': int_vals}) B = DataFrame({'Y': float_vals}) - exp_res = DataFrame(exp_vals) + expected = DataFrame(exp_vals) - res = A.merge(B, left_on='X', right_on='Y') - assert_frame_equal(res, exp_res) + result = A.merge(B, left_on='X', right_on='Y') + assert_frame_equal(result, expected) - res = B.merge(A, left_on='Y', right_on='X') - assert_frame_equal(res, exp_res[['Y', 'X']]) + result = B.merge(A, left_on='Y', right_on='X') + assert_frame_equal(result, expected[['Y', 'X']]) def test_merge_on_ints_floats_warning(self): # GH 16572 @@ -1437,15 +1437,15 @@ def test_merge_on_ints_floats_warning(self): # equal to their int representation A = DataFrame({'X': [1, 2, 3]}) B = DataFrame({'Y': [1.1, 2.5, 3.0]}) - exp_res = DataFrame({'X': [3], 'Y': [3.0]}) + expected = DataFrame({'X': [3], 'Y': [3.0]}) with tm.assert_produces_warning(UserWarning): - res = A.merge(B, left_on='X', right_on='Y') - assert_frame_equal(res, exp_res) + result = A.merge(B, left_on='X', right_on='Y') + assert_frame_equal(result, expected) with tm.assert_produces_warning(UserWarning): - res = B.merge(A, left_on='Y', right_on='X') - assert_frame_equal(res, exp_res[['Y', 'X']]) + result = B.merge(A, left_on='Y', right_on='X') + assert_frame_equal(result, expected[['Y', 'X']]) @pytest.fixture From c46875b375f2667a5272f0d12c3b137a0210eb81 Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Wed, 22 Nov 2017 23:29:32 +0000 Subject: [PATCH 7/8] remove some pass statements --- pandas/core/reshape/merge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index f3e38dd7d1ffb..0b330e566d23a 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -920,7 +920,7 @@ def _maybe_coerce_merge_keys(self): 'columns where the float values ' 'are not equal to their int ' 'representation', UserWarning) - pass + elif is_float_dtype(rk) and is_integer_dtype(lk): if not (rk == rk.astype(lk.dtype)).all(): @@ -928,7 +928,7 @@ def _maybe_coerce_merge_keys(self): 'columns where the float values ' 'are not equal to their int ' 'representation', UserWarning) - pass + # let's infer and see if we are ok elif lib.infer_dtype(lk) == lib.infer_dtype(rk): From a3d6fe656df83f4e56363b5e90785d1f37448c24 Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Wed, 22 Nov 2017 23:56:37 +0000 Subject: [PATCH 8/8] whitespace --- pandas/core/reshape/merge.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 0b330e566d23a..d00aa1003988a 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -921,7 +921,6 @@ def _maybe_coerce_merge_keys(self): 'are not equal to their int ' 'representation', UserWarning) - elif is_float_dtype(rk) and is_integer_dtype(lk): if not (rk == rk.astype(lk.dtype)).all(): warnings.warn('You are merging on int and float ' @@ -929,7 +928,6 @@ def _maybe_coerce_merge_keys(self): 'are not equal to their int ' 'representation', UserWarning) - # let's infer and see if we are ok elif lib.infer_dtype(lk) == lib.infer_dtype(rk): pass