Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: merge_asof() has left_index/right_index and left_by/right_by (#14253) #14426

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions doc/source/whatsnew/v0.19.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,17 @@ Highlights include:
:backlinks: none



.. _whatsnew_0191.api.other:

Other API Changes
^^^^^^^^^^^^^^^^^

- ``pd.merge_asof()`` can take ``left_index``/``right_index`` and ``left_by``/``right_by`` (:issue:`14253`)




.. _whatsnew_0191.performance:

Performance Improvements
Expand Down
120 changes: 93 additions & 27 deletions pandas/tools/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,8 @@ def _merger(x, y):

def merge_asof(left, right, on=None,
left_on=None, right_on=None,
by=None,
left_index=False, right_index=False,
by=None, left_by=None, right_by=None,
suffixes=('_x', '_y'),
tolerance=None,
allow_exact_matches=True):
Expand Down Expand Up @@ -288,9 +289,29 @@ def merge_asof(left, right, on=None,
Field name to join on in left DataFrame.
right_on : label
Field name to join on in right DataFrame.
left_index : boolean
Use the index of the left DataFrame as the join key.

.. versionadded:: 0.19.1

right_index : boolean
Use the index of the right DataFrame as the join key.

.. versionadded:: 0.19.1

by : column name
Group both the left and right DataFrames by the group column; perform
the merge operation on these pieces and recombine.
left_by : column name
Field name to group by in the left DataFrame.

.. versionadded:: 0.19.1

right_by : column name
Field name to group by in the right DataFrame.

.. versionadded:: 0.19.1

suffixes : 2-length sequence (tuple, list, ...)
Suffix to apply to overlapping column names in the left and right
side, respectively
Expand Down Expand Up @@ -418,7 +439,9 @@ def merge_asof(left, right, on=None,
"""
op = _AsOfMerge(left, right,
on=on, left_on=left_on, right_on=right_on,
by=by, suffixes=suffixes,
left_index=left_index, right_index=right_index,
by=by, left_by=left_by, right_by=right_by,
suffixes=suffixes,
how='asof', tolerance=tolerance,
allow_exact_matches=allow_exact_matches)
return op.get_result()
Expand Down Expand Up @@ -641,7 +664,7 @@ def _get_join_info(self):
left_ax = self.left._data.axes[self.axis]
right_ax = self.right._data.axes[self.axis]

if self.left_index and self.right_index:
if self.left_index and self.right_index and self.how != 'asof':
join_index, left_indexer, right_indexer = \
left_ax.join(right_ax, how=self.how, return_indexers=True)
elif self.right_index and self.how == 'left':
Expand Down Expand Up @@ -731,21 +754,35 @@ def _get_merge_keys(self):
right_keys.append(rk)
join_names.append(None) # what to do?
else:
right_keys.append(right[rk]._values)
join_names.append(rk)
if rk is not None:
right_keys.append(right[rk]._values)
join_names.append(rk)
else:
# kludge for merge_asof(right_index=True)
right_keys.append(right.index.values)
join_names.append(right.index.name)
else:
if not is_rkey(rk):
right_keys.append(right[rk]._values)
if lk == rk:
if rk is not None:
right_keys.append(right[rk]._values)
else:
# kludge for merge_asof(right_index=True)
right_keys.append(right.index.values)
if lk is not None and lk == rk:
# avoid key upcast in corner case (length-0)
if len(left) > 0:
right_drop.append(rk)
else:
left_drop.append(lk)
else:
right_keys.append(rk)
left_keys.append(left[lk]._values)
join_names.append(lk)
if lk is not None:
left_keys.append(left[lk]._values)
join_names.append(lk)
else:
# kludge for merge_asof(left_index=True)
left_keys.append(left.index.values)
join_names.append(left.index.name)
elif _any(self.left_on):
for k in self.left_on:
if is_lkey(k):
Expand Down Expand Up @@ -870,13 +907,15 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner',
class _OrderedMerge(_MergeOperation):
_merge_type = 'ordered_merge'

def __init__(self, left, right, on=None, left_on=None,
right_on=None, axis=1,
def __init__(self, left, right, on=None, left_on=None, right_on=None,
left_index=False, right_index=False, axis=1,
suffixes=('_x', '_y'), copy=True,
fill_method=None, how='outer'):

self.fill_method = fill_method
_MergeOperation.__init__(self, left, right, on=on, left_on=left_on,
left_index=left_index,
right_index=right_index,
right_on=right_on, axis=axis,
how=how, suffixes=suffixes,
sort=True # factorize sorts
Expand Down Expand Up @@ -949,43 +988,68 @@ def _get_cython_type(dtype):
class _AsOfMerge(_OrderedMerge):
_merge_type = 'asof_merge'

def __init__(self, left, right, on=None, by=None, left_on=None,
right_on=None, axis=1,
suffixes=('_x', '_y'), copy=True,
def __init__(self, left, right, on=None, left_on=None, right_on=None,
left_index=False, right_index=False,
by=None, left_by=None, right_by=None,
axis=1, suffixes=('_x', '_y'), copy=True,
fill_method=None,
how='asof', tolerance=None,
allow_exact_matches=True):

self.by = by
self.left_by = left_by
self.right_by = right_by
self.tolerance = tolerance
self.allow_exact_matches = allow_exact_matches

_OrderedMerge.__init__(self, left, right, on=on, left_on=left_on,
right_on=right_on, axis=axis,
right_on=right_on, left_index=left_index,
right_index=right_index, axis=axis,
how=how, suffixes=suffixes,
fill_method=fill_method)

def _validate_specification(self):
super(_AsOfMerge, self)._validate_specification()

# we only allow on to be a single item for on
if len(self.left_on) != 1:
if len(self.left_on) != 1 and not self.left_index:
raise MergeError("can only asof on a key for left")

if len(self.right_on) != 1:
if len(self.right_on) != 1 and not self.right_index:
raise MergeError("can only asof on a key for right")

if self.left_index and isinstance(self.left.index, MultiIndex):
raise MergeError("left can only have one index")

if self.right_index and isinstance(self.right.index, MultiIndex):
raise MergeError("right can only have one index")

# set 'by' columns
if self.by is not None:
if self.left_by is not None or self.right_by is not None:
raise MergeError('Can only pass by OR left_by '
'and right_by')
self.left_by = self.right_by = self.by
if self.left_by is None and self.right_by is not None:
raise MergeError('missing left_by')
if self.left_by is not None and self.right_by is None:
raise MergeError('missing right_by')

# add by to our key-list so we can have it in the
# output as a key
if self.by is not None:
if not is_list_like(self.by):
self.by = [self.by]
if self.left_by is not None:
if not is_list_like(self.left_by):
self.left_by = [self.left_by]
if not is_list_like(self.right_by):
self.right_by = [self.right_by]

if len(self.by) != 1:
if len(self.left_by) != 1:
raise MergeError("can only asof by a single key")
if len(self.right_by) != 1:
raise MergeError("can only asof by a single key")

self.left_on = self.by + list(self.left_on)
self.right_on = self.by + list(self.right_on)
self.left_on = self.left_by + list(self.left_on)
self.right_on = self.right_by + list(self.right_on)

@property
def _asof_key(self):
Expand All @@ -1008,7 +1072,7 @@ def _get_merge_keys(self):
# validate tolerance; must be a Timedelta if we have a DTI
if self.tolerance is not None:

lt = left_join_keys[self.left_on.index(self._asof_key)]
lt = left_join_keys[-1]
msg = "incompatible tolerance, must be compat " \
"with type {0}".format(type(lt))

Expand Down Expand Up @@ -1038,8 +1102,10 @@ def _get_join_indexers(self):
""" return the join indexers """

# values to compare
left_values = self.left_join_keys[-1]
right_values = self.right_join_keys[-1]
left_values = (self.left.index.values if self.left_index else
self.left_join_keys[-1])
right_values = (self.right.index.values if self.right_index else
self.right_join_keys[-1])
tolerance = self.tolerance

# we required sortedness in the join keys
Expand All @@ -1057,7 +1123,7 @@ def _get_join_indexers(self):
tolerance = tolerance.value

# a "by" parameter requires special handling
if self.by is not None:
if self.left_by is not None:
left_by_values = self.left_join_keys[0]
right_by_values = self.right_join_keys[0]

Expand Down
69 changes: 69 additions & 0 deletions pandas/tools/tests/test_merge_asof.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,75 @@ def test_basic_categorical(self):
by='ticker')
assert_frame_equal(result, expected)

def test_basic_left_index(self):

expected = self.asof
trades = self.trades.set_index('time')
quotes = self.quotes

result = merge_asof(trades, quotes,
left_index=True,
right_on='time',
by='ticker')
# left-only index uses right's index, oddly
expected.index = result.index
# time column appears after left's columns
expected = expected[result.columns]
assert_frame_equal(result, expected)

def test_basic_right_index(self):

expected = self.asof
trades = self.trades
quotes = self.quotes.set_index('time')

result = merge_asof(trades, quotes,
left_on='time',
right_index=True,
by='ticker')
assert_frame_equal(result, expected)

def test_basic_left_index_right_index(self):

expected = self.asof.set_index('time')
trades = self.trades.set_index('time')
quotes = self.quotes.set_index('time')

result = merge_asof(trades, quotes,
left_index=True,
right_index=True,
by='ticker')
assert_frame_equal(result, expected)

def test_multi_index(self):

# MultiIndex is prohibited
trades = self.trades.set_index(['time', 'price'])
quotes = self.quotes.set_index('time')
with self.assertRaises(MergeError):
merge_asof(trades, quotes,
left_index=True,
right_index=True)

trades = self.trades.set_index('time')
quotes = self.quotes.set_index(['time', 'bid'])
with self.assertRaises(MergeError):
merge_asof(trades, quotes,
left_index=True,
right_index=True)

def test_basic_left_by_right_by(self):

expected = self.asof
trades = self.trades
quotes = self.quotes

result = merge_asof(trades, quotes,
on='time',
left_by='ticker',
right_by='ticker')
assert_frame_equal(result, expected)

def test_missing_right_by(self):

expected = self.asof
Expand Down