From 34ebad832d6709ecd479c4db4705a9f81da015b3 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Wed, 17 May 2017 09:36:51 +0200
Subject: [PATCH] PERF: improve MultiIndex get_loc performance (#16346)

* PERF: improve hash collision check for single MI labels

* PERF: specialized hash function for single tuples

* BUG: declare _check_for_collision with ``except *``. A ``cdef void``
  function without an exception clause cannot propagate Python
  exceptions under Cython's historical default, so the AssertionError
  raised on a hash collision would not reach the caller and get_item
  would return the colliding loc.
---
 asv_bench/benchmarks/indexing.py           | 12 +++++
 doc/source/whatsnew/v0.20.2.txt            |  3 +-
 pandas/_libs/hashtable.pxd                 |  3 +
 pandas/_libs/hashtable_class_helper.pxi.in | 20 +++++++-
 pandas/core/indexes/multi.py               |  4 +-
 pandas/core/util/hashing.py                | 56 +++++++++++++++++++++-
 pandas/tests/util/test_hashing.py          | 24 +++++++++-
 7 files changed, 115 insertions(+), 7 deletions(-)

diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
index e1676715853a4..6a2c9d48c4a28 100644
--- a/asv_bench/benchmarks/indexing.py
+++ b/asv_bench/benchmarks/indexing.py
@@ -227,12 +227,24 @@ def time_multiindex_get_indexer(self):
     def time_multiindex_large_get_loc(self):
         self.mi_large.get_loc((999, 19, 'Z'))
 
+    def time_multiindex_large_get_loc_warm(self):
+        for _ in range(1000):
+            self.mi_large.get_loc((999, 19, 'Z'))
+
     def time_multiindex_med_get_loc(self):
         self.mi_med.get_loc((999, 9, 'A'))
 
+    def time_multiindex_med_get_loc_warm(self):
+        for _ in range(1000):
+            self.mi_med.get_loc((999, 9, 'A'))
+
     def time_multiindex_string_get_loc(self):
         self.mi_small.get_loc((99, 'A', 'A'))
 
+    def time_multiindex_small_get_loc_warm(self):
+        for _ in range(1000):
+            self.mi_small.get_loc((99, 'A', 'A'))
+
     def time_is_monotonic(self):
         self.miint.is_monotonic
 
diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt
index 10a6b4354290d..7773f5abfb0ba 100644
--- a/doc/source/whatsnew/v0.20.2.txt
+++ b/doc/source/whatsnew/v0.20.2.txt
@@ -27,9 +27,10 @@ Performance Improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
 - Performance regression fix when indexing with a list-like (:issue:`16285`)
-- Performance regression fix for small MultiIndexes (:issuse:`16319`)
+- Performance regression fix for MultiIndexes (:issue:`16319`, :issue:`16346`)
 - Improved performance of ``.clip()`` with scalar arguments (:issue:`15400`)
 
+
 .. _whatsnew_0202.bug_fixes:
 
 Bug Fixes
diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd
index 3366751af144d..014da22df3382 100644
--- a/pandas/_libs/hashtable.pxd
+++ b/pandas/_libs/hashtable.pxd
@@ -38,6 +38,9 @@ cdef class MultiIndexHashTable(HashTable):
     cpdef get_item(self, object val)
     cpdef set_item(self, object key, Py_ssize_t val)
 
+    cdef inline void _check_for_collision(self, Py_ssize_t loc,
+                                          object label) except *
+
 cdef class StringHashTable(HashTable):
     cdef kh_str_t *table
 
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index b80a592669eca..3ef52c5c59c9d 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -4,6 +4,9 @@ Template for each `dtype` helper function for hashtable
 WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
 """
 
+from lib cimport is_null_datetimelike
+
+
 #----------------------------------------------------------------------
 # VectorData
 #----------------------------------------------------------------------
@@ -921,6 +924,20 @@ cdef class MultiIndexHashTable(HashTable):
                 "hash collision\nlocs:\n{}\n"
                 "result:\n{}\nmi:\n{}".format(alocs, result, mi))
 
+    cdef inline void _check_for_collision(self, Py_ssize_t loc,
+                                          object label) except *:
+        # validate that the loc maps to the actual value
+        # version of _check_for_collisions above for single label (tuple)
+
+        result = self.mi[loc]
+
+        if not all(l == r or (is_null_datetimelike(l)
+                              and is_null_datetimelike(r))
+                   for l, r in zip(result, label)):
+            raise AssertionError(
+                "hash collision\nloc:\n{}\n"
+                "result:\n{}\nmi:\n{}".format(loc, result, label))
+
     def __contains__(self, object key):
         try:
             self.get_item(key)
@@ -939,8 +956,7 @@ cdef class MultiIndexHashTable(HashTable):
         k = kh_get_uint64(self.table, value)
         if k != self.table.n_buckets:
             loc = self.table.vals[k]
-            locs = np.array([loc], dtype=np.int64)
-            self._check_for_collisions(locs, key)
+            self._check_for_collision(loc, key)
             return loc
         else:
             raise KeyError(key)
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 3db5633ec30bd..569e16f2141ae 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -748,7 +748,7 @@ def _hashed_indexing_key(self, key):
         we need to stringify if we have mixed levels
 
         """
-        from pandas.core.util.hashing import hash_tuples
+        from pandas.core.util.hashing import hash_tuples, hash_tuple
 
         if not isinstance(key, tuple):
             return hash_tuples(key)
@@ -762,7 +762,7 @@ def f(k, stringify):
             return k
         key = tuple([f(k, stringify)
                      for k, stringify in zip(key, self._have_mixed_levels)])
-        return hash_tuples(key)
+        return hash_tuple(key)
 
     @Appender(base._shared_docs['duplicated'] % _index_doc_kwargs)
     def duplicated(self, keep='first'):
diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py
index f0829adc94500..e41ffae9d03c2 100644
--- a/pandas/core/util/hashing.py
+++ b/pandas/core/util/hashing.py
@@ -4,7 +4,7 @@
 import itertools
 
 import numpy as np
-from pandas._libs import hashing
+from pandas._libs import hashing, tslib
 from pandas.core.dtypes.generic import (
     ABCMultiIndex,
     ABCIndexClass,
@@ -12,6 +12,9 @@
     ABCDataFrame)
 from pandas.core.dtypes.common import (
     is_categorical_dtype, is_list_like)
+from pandas.core.dtypes.missing import isnull
+from pandas.core.dtypes.cast import infer_dtype_from_scalar
+
 
 # 16 byte long hashing key
 _default_hash_key = '0123456789123456'
@@ -164,6 +167,29 @@ def hash_tuples(vals, encoding='utf8', hash_key=None):
     return h
 
 
+def hash_tuple(val, encoding='utf8', hash_key=None):
+    """
+    Hash a single tuple efficiently
+
+    Parameters
+    ----------
+    val : single tuple
+    encoding : string, default 'utf8'
+    hash_key : string key to encode, default to _default_hash_key
+
+    Returns
+    -------
+    hash
+
+    """
+    hashes = (_hash_scalar(v, encoding=encoding, hash_key=hash_key)
+              for v in val)
+
+    h = _combine_hash_arrays(hashes, len(val))[0]
+
+    return h
+
+
 def _hash_categorical(c, encoding, hash_key):
     """
     Hash a Categorical by hashing its categories, and then mapping the codes
@@ -276,3 +302,31 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
     vals *= np.uint64(0x94d049bb133111eb)
     vals ^= vals >> 31
     return vals
+
+
+def _hash_scalar(val, encoding='utf8', hash_key=None):
+    """
+    Hash scalar value
+
+    Returns
+    -------
+    1d uint64 numpy array of hash value, of length 1
+    """
+
+    if isnull(val):
+        # this is to be consistent with the _hash_categorical implementation
+        return np.array([np.iinfo(np.uint64).max], dtype='u8')
+
+    if getattr(val, 'tzinfo', None) is not None:
+        # for tz-aware datetimes, we need the underlying naive UTC value and
+        # not the tz aware object or pd extension type (as
+        # infer_dtype_from_scalar would do)
+        if not isinstance(val, tslib.Timestamp):
+            val = tslib.Timestamp(val)
+        val = val.tz_convert(None)
+
+    dtype, val = infer_dtype_from_scalar(val)
+    vals = np.array([val], dtype=dtype)
+
+    return hash_array(vals, hash_key=hash_key, encoding=encoding,
+                      categorize=False)
diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py
index e1e6e43529a7d..289592939e3da 100644
--- a/pandas/tests/util/test_hashing.py
+++ b/pandas/tests/util/test_hashing.py
@@ -1,4 +1,5 @@
 import pytest
+import datetime
 
 from warnings import catch_warnings
 import numpy as np
@@ -6,7 +7,7 @@
 
 from pandas import DataFrame, Series, Index, MultiIndex
 from pandas.util import hash_array, hash_pandas_object
-from pandas.core.util.hashing import hash_tuples
+from pandas.core.util.hashing import hash_tuples, hash_tuple, _hash_scalar
 import pandas.util.testing as tm
 
 
@@ -79,6 +80,27 @@ def test_hash_tuples(self):
         result = hash_tuples(tups[0])
         assert result == expected[0]
 
+    def test_hash_tuple(self):
+        # test equivalence between hash_tuples and hash_tuple
+        for tup in [(1, 'one'), (1, np.nan), (1.0, pd.NaT, 'A'),
+                    ('A', pd.Timestamp("2012-01-01"))]:
+            result = hash_tuple(tup)
+            expected = hash_tuples([tup])[0]
+            assert result == expected
+
+    def test_hash_scalar(self):
+        for val in [1, 1.4, 'A', b'A', u'A', pd.Timestamp("2012-01-01"),
+                    pd.Timestamp("2012-01-01", tz='Europe/Brussels'),
+                    datetime.datetime(2012, 1, 1),
+                    pd.Timestamp("2012-01-01", tz='EST').to_pydatetime(),
+                    pd.Timedelta('1 days'), datetime.timedelta(1),
+                    pd.Period('2012-01-01', freq='D'), pd.Interval(0, 1),
+                    np.nan, pd.NaT, None]:
+            result = _hash_scalar(val)
+            expected = hash_array(np.array([val], dtype=object),
+                                  categorize=True)
+            assert result[0] == expected[0]
+
     def test_hash_tuples_err(self):
 
         for val in [5, 'foo', pd.Timestamp('20130101')]: