Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Keep float dtype in merge on int and float column #18352

Merged
merged 8 commits into from
Nov 23, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.22.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ Backwards incompatible API changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

- :func:`Series.fillna` now raises a ``TypeError`` instead of a ``ValueError`` when passed a list, tuple or DataFrame as a ``value`` (:issue:`18293`)
-
- :func:`pandas.DataFrame.merge` no longer casts a ``float`` column to ``object`` when merging on ``int`` and ``float`` columns (:issue:`16572`)
-


Expand Down
40 changes: 28 additions & 12 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -906,16 +906,31 @@ def _maybe_coerce_merge_keys(self):
continue

# if we are numeric, then allow differing
# kinds to proceed, eg. int64 and int8
# kinds to proceed, eg. int64 and int8, int and float
# further if we are object, but we infer to
# the same, then proceed
if is_numeric_dtype(lk) and is_numeric_dtype(rk):
if lk.dtype.kind == rk.dtype.kind:
continue
pass

# check whether ints and floats
elif is_integer_dtype(rk) and is_float_dtype(lk):
if not (lk == lk.astype(rk.dtype)).all():
warnings.warn('You are merging on int and float '
'columns where the float values '
'are not equal to their int '
'representation', UserWarning)

elif is_float_dtype(rk) and is_integer_dtype(lk):
if not (rk == rk.astype(lk.dtype)).all():
warnings.warn('You are merging on int and float '
'columns where the float values '
'are not equal to their int '
'representation', UserWarning)

# let's infer and see if we are ok
if lib.infer_dtype(lk) == lib.infer_dtype(rk):
continue
elif lib.infer_dtype(lk) == lib.infer_dtype(rk):
pass
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this one's good!


# Houston, we have a problem!
# let's coerce to object if the dtypes aren't
Expand All @@ -924,14 +939,15 @@ def _maybe_coerce_merge_keys(self):
# then we would lose type information on some
# columns, and end up trying to merge
# incompatible dtypes. See GH 16900.
if name in self.left.columns:
typ = lk.categories.dtype if lk_is_cat else object
self.left = self.left.assign(
**{name: self.left[name].astype(typ)})
if name in self.right.columns:
typ = rk.categories.dtype if rk_is_cat else object
self.right = self.right.assign(
**{name: self.right[name].astype(typ)})
else:
if name in self.left.columns:
typ = lk.categories.dtype if lk_is_cat else object
self.left = self.left.assign(
**{name: self.left[name].astype(typ)})
if name in self.right.columns:
typ = rk.categories.dtype if rk_is_cat else object
self.right = self.right.assign(
**{name: self.right[name].astype(typ)})

def _validate_specification(self):
# Hm, any way to make this logic less complicated??
Expand Down
41 changes: 40 additions & 1 deletion pandas/tests/reshape/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@
from pandas.core.reshape.merge import merge, MergeError
from pandas.util.testing import assert_frame_equal, assert_series_equal
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.common import is_categorical_dtype, is_object_dtype
from pandas.core.dtypes.common import (
is_categorical_dtype,
is_object_dtype,
)
from pandas import DataFrame, Index, MultiIndex, Series, Categorical
import pandas.util.testing as tm
from pandas.api.types import CategoricalDtype as CDT
Expand Down Expand Up @@ -1408,6 +1411,42 @@ def test_join_multi_dtypes(self, d1, d2):
expected.sort_values(['k1', 'k2'], kind='mergesort', inplace=True)
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize('int_vals, float_vals, exp_vals', [
([1, 2, 3], [1.0, 2.0, 3.0], {'X': [1, 2, 3], 'Y': [1.0, 2.0, 3.0]}),
([1, 2, 3], [1.0, 3.0], {'X': [1, 3], 'Y': [1.0, 3.0]}),
([1, 2], [1.0, 2.0, 3.0], {'X': [1, 2], 'Y': [1.0, 2.0]}),
])
def test_merge_on_ints_floats(self, int_vals, float_vals, exp_vals):
# GH 16572
# Check that float column is not cast to object if
# merging on float and int columns
A = DataFrame({'X': int_vals})
B = DataFrame({'Y': float_vals})
expected = DataFrame(exp_vals)

result = A.merge(B, left_on='X', right_on='Y')
assert_frame_equal(result, expected)

result = B.merge(A, left_on='Y', right_on='X')
assert_frame_equal(result, expected[['Y', 'X']])

def test_merge_on_ints_floats_warning(self):
# GH 16572
# merge will produce a warning when merging on int and
# float columns where the float values are not exactly
# equal to their int representation
A = DataFrame({'X': [1, 2, 3]})
B = DataFrame({'Y': [1.1, 2.5, 3.0]})
expected = DataFrame({'X': [3], 'Y': [3.0]})

with tm.assert_produces_warning(UserWarning):
result = A.merge(B, left_on='X', right_on='Y')
assert_frame_equal(result, expected)

with tm.assert_produces_warning(UserWarning):
result = B.merge(A, left_on='Y', right_on='X')
assert_frame_equal(result, expected[['Y', 'X']])


@pytest.fixture
def left():
Expand Down