PERF: improve MultiIndex get_loc performance (pandas-dev#16346)
* PERF: improve hash collision check for single MI labels
* PERF: specialized hash function for single tuples
jorisvandenbossche authored and pcluo committed May 22, 2017
1 parent 425bc73 commit 1fa8201
Showing 7 changed files with 113 additions and 7 deletions.
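For context, the operation this commit speeds up is a single-tuple lookup on a MultiIndex. A minimal sketch follows; the index below is hypothetical and is not the benchmark fixture used in the asv suite.

import pandas as pd

# Hypothetical three-level index, used only to illustrate the call path
# this commit optimizes; it is not the asv benchmark fixture.
mi = pd.MultiIndex.from_product([range(1000), range(20), list('XYZ')])

# With the hash-based engine used for larger indexes, get_loc hashes the
# tuple, probes the MultiIndex hash table, and validates the hit against
# the real labels (the collision check made cheaper by this change).
loc = mi.get_loc((999, 19, 'Z'))
print(loc)  # integer position of the label within the index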
12 changes: 12 additions & 0 deletions asv_bench/benchmarks/indexing.py
@@ -227,12 +227,24 @@ def time_multiindex_get_indexer(self):
    def time_multiindex_large_get_loc(self):
        self.mi_large.get_loc((999, 19, 'Z'))

    def time_multiindex_large_get_loc_warm(self):
        for _ in range(1000):
            self.mi_large.get_loc((999, 19, 'Z'))

    def time_multiindex_med_get_loc(self):
        self.mi_med.get_loc((999, 9, 'A'))

    def time_multiindex_med_get_loc_warm(self):
        for _ in range(1000):
            self.mi_med.get_loc((999, 9, 'A'))

    def time_multiindex_string_get_loc(self):
        self.mi_small.get_loc((99, 'A', 'A'))

    def time_multiindex_small_get_loc_warm(self):
        for _ in range(1000):
            self.mi_small.get_loc((99, 'A', 'A'))

    def time_is_monotonic(self):
        self.miint.is_monotonic

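A rough way to reproduce what the new *_warm benchmarks above measure, outside asv. The index construction here is an assumption, since the benchmark's setup (mi_large and friends) is not part of this diff.

import string
import timeit

import pandas as pd

# Assumed stand-in for the benchmark's mi_large; the real asv setup is
# defined elsewhere and may differ in size and level types.
mi_large = pd.MultiIndex.from_product(
    [range(1000), range(20), list(string.ascii_uppercase)])

# Warm timing: repeated get_loc calls so that per-call costs (tuple hashing
# and the collision check) dominate over the one-time engine construction.
elapsed = timeit.timeit(lambda: mi_large.get_loc((999, 19, 'Z')), number=1000)
print('1000 warm get_loc calls: {:.4f}s'.format(elapsed))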
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v0.20.2.txt
@@ -27,9 +27,10 @@ Performance Improvements
~~~~~~~~~~~~~~~~~~~~~~~~

- Performance regression fix when indexing with a list-like (:issue:`16285`)
-- Performance regression fix for small MultiIndexes (:issuse:`16319`)
+- Performance regression fix for MultiIndexes (:issue:`16319`, :issue:`16346`)
- Improved performance of ``.clip()`` with scalar arguments (:issue:`15400`)


.. _whatsnew_0202.bug_fixes:

Bug Fixes
2 changes: 2 additions & 0 deletions pandas/_libs/hashtable.pxd
@@ -38,6 +38,8 @@ cdef class MultiIndexHashTable(HashTable):

    cpdef get_item(self, object val)
    cpdef set_item(self, object key, Py_ssize_t val)
    cdef inline void _check_for_collision(self, Py_ssize_t loc, object label)


cdef class StringHashTable(HashTable):
    cdef kh_str_t *table
19 changes: 17 additions & 2 deletions pandas/_libs/hashtable_class_helper.pxi.in
@@ -4,6 +4,9 @@ Template for each `dtype` helper function for hashtable
WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
"""

from lib cimport is_null_datetimelike


#----------------------------------------------------------------------
# VectorData
#----------------------------------------------------------------------
@@ -921,6 +924,19 @@ cdef class MultiIndexHashTable(HashTable):
                "hash collision\nlocs:\n{}\n"
                "result:\n{}\nmi:\n{}".format(alocs, result, mi))

    cdef inline void _check_for_collision(self, Py_ssize_t loc, object label):
        # validate that the loc maps to the actual value
        # version of _check_for_collisions above for single label (tuple)

        result = self.mi[loc]

        if not all(l == r or (is_null_datetimelike(l)
                              and is_null_datetimelike(r))
                   for l, r in zip(result, label)):
            raise AssertionError(
                "hash collision\nloc:\n{}\n"
                "result:\n{}\nmi:\n{}".format(loc, result, label))

    def __contains__(self, object key):
        try:
            self.get_item(key)
@@ -939,8 +955,7 @@ cdef class MultiIndexHashTable(HashTable):
        k = kh_get_uint64(self.table, value)
        if k != self.table.n_buckets:
            loc = self.table.vals[k]
-           locs = np.array([loc], dtype=np.int64)
-           self._check_for_collisions(locs, key)
+           self._check_for_collision(loc, key)
            return loc
        else:
            raise KeyError(key)
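For readers unfamiliar with Cython, a plain-Python sketch of what the new single-label collision check does; pd.isnull stands in for is_null_datetimelike, and the function and variable names here are illustrative, not part of the commit.

import pandas as pd

def check_single_label(mi, loc, label):
    # Compare the row found via the hash table against the queried label,
    # treating missing values (NaN/NaT/None) on both sides as equal.
    result = mi[loc]
    if not all(l == r or (pd.isnull(l) and pd.isnull(r))
               for l, r in zip(result, label)):
        raise AssertionError(
            "hash collision\nloc:\n{}\nresult:\n{}\nmi:\n{}".format(
                loc, result, label))

mi = pd.MultiIndex.from_tuples([(1, 'a'), (2, 'b')])
check_single_label(mi, 1, (2, 'b'))  # passes silently; a mismatch would raise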
4 changes: 2 additions & 2 deletions pandas/core/indexes/multi.py
@@ -748,7 +748,7 @@ def _hashed_indexing_key(self, key):
        we need to stringify if we have mixed levels
        """
-       from pandas.core.util.hashing import hash_tuples
+       from pandas.core.util.hashing import hash_tuples, hash_tuple

        if not isinstance(key, tuple):
            return hash_tuples(key)
@@ -762,7 +762,7 @@ def f(k, stringify):
            return k
        key = tuple([f(k, stringify)
                     for k, stringify in zip(key, self._have_mixed_levels)])
-       return hash_tuples(key)
+       return hash_tuple(key)

    @Appender(base._shared_docs['duplicated'] % _index_doc_kwargs)
    def duplicated(self, keep='first'):
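The swap from hash_tuples(key) to hash_tuple(key) relies on the two producing the same value for a single tuple; a quick check, mirroring the new test further down:

import pandas as pd
from pandas.core.util.hashing import hash_tuple, hash_tuples

key = (1, 'one')
# hash_tuple hashes the scalars one by one and combines them, avoiding the
# array/Categorical machinery hash_tuples goes through for a list of tuples.
assert hash_tuple(key) == hash_tuples([key])[0]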
56 changes: 55 additions & 1 deletion pandas/core/util/hashing.py
@@ -4,14 +4,17 @@
import itertools

import numpy as np
-from pandas._libs import hashing
+from pandas._libs import hashing, tslib
from pandas.core.dtypes.generic import (
    ABCMultiIndex,
    ABCIndexClass,
    ABCSeries,
    ABCDataFrame)
from pandas.core.dtypes.common import (
    is_categorical_dtype, is_list_like)
from pandas.core.dtypes.missing import isnull
from pandas.core.dtypes.cast import infer_dtype_from_scalar


# 16 byte long hashing key
_default_hash_key = '0123456789123456'
@@ -164,6 +167,29 @@ def hash_tuples(vals, encoding='utf8', hash_key=None):
    return h


def hash_tuple(val, encoding='utf8', hash_key=None):
    """
    Hash a single tuple efficiently

    Parameters
    ----------
    val : single tuple
    encoding : string, default 'utf8'
    hash_key : string key to encode, default to _default_hash_key

    Returns
    -------
    hash

    """
    hashes = (_hash_scalar(v, encoding=encoding, hash_key=hash_key)
              for v in val)

    h = _combine_hash_arrays(hashes, len(val))[0]

    return h


def _hash_categorical(c, encoding, hash_key):
    """
    Hash a Categorical by hashing its categories, and then mapping the codes
@@ -276,3 +302,31 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
    vals *= np.uint64(0x94d049bb133111eb)
    vals ^= vals >> 31
    return vals


def _hash_scalar(val, encoding='utf8', hash_key=None):
    """
    Hash scalar value

    Returns
    -------
    1d uint64 numpy array of hash value, of length 1
    """

    if isnull(val):
        # this is to be consistent with the _hash_categorical implementation
        return np.array([np.iinfo(np.uint64).max], dtype='u8')

    if getattr(val, 'tzinfo', None) is not None:
        # for tz-aware datetimes, we need the underlying naive UTC value and
        # not the tz aware object or pd extension type (as
        # infer_dtype_from_scalar would do)
        if not isinstance(val, tslib.Timestamp):
            val = tslib.Timestamp(val)
        val = val.tz_convert(None)

    dtype, val = infer_dtype_from_scalar(val)
    vals = np.array([val], dtype=dtype)

    return hash_array(vals, hash_key=hash_key, encoding=encoding,
                      categorize=False)
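To illustrate the tz-aware branch of _hash_scalar above, a step-by-step sketch; the timestamp value is illustrative only, and the exact resulting dtype may vary across pandas versions.

import numpy as np
import pandas as pd
from pandas.core.dtypes.cast import infer_dtype_from_scalar

val = pd.Timestamp("2012-01-01", tz="Europe/Brussels")

# Convert to the underlying naive UTC instant, as _hash_scalar does, so the
# hash does not depend on the timezone the value is displayed in.
val = val.tz_convert(None)

# Wrap the scalar in a length-1 array of its inferred dtype and hash that.
dtype, val = infer_dtype_from_scalar(val)
vals = np.array([val], dtype=dtype)
print(vals.dtype)  # a datetime64 dtype; datetime64[ns] on the pandas of this commit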
24 changes: 23 additions & 1 deletion pandas/tests/util/test_hashing.py
@@ -1,12 +1,13 @@
import pytest
import datetime

from warnings import catch_warnings
import numpy as np
import pandas as pd

from pandas import DataFrame, Series, Index, MultiIndex
from pandas.util import hash_array, hash_pandas_object
-from pandas.core.util.hashing import hash_tuples
+from pandas.core.util.hashing import hash_tuples, hash_tuple, _hash_scalar
import pandas.util.testing as tm


@@ -79,6 +80,27 @@ def test_hash_tuples(self):
        result = hash_tuples(tups[0])
        assert result == expected[0]

    def test_hash_tuple(self):
        # test equivalence between hash_tuples and hash_tuple
        for tup in [(1, 'one'), (1, np.nan), (1.0, pd.NaT, 'A'),
                    ('A', pd.Timestamp("2012-01-01"))]:
            result = hash_tuple(tup)
            expected = hash_tuples([tup])[0]
            assert result == expected

    def test_hash_scalar(self):
        for val in [1, 1.4, 'A', b'A', u'A', pd.Timestamp("2012-01-01"),
                    pd.Timestamp("2012-01-01", tz='Europe/Brussels'),
                    datetime.datetime(2012, 1, 1),
                    pd.Timestamp("2012-01-01", tz='EST').to_pydatetime(),
                    pd.Timedelta('1 days'), datetime.timedelta(1),
                    pd.Period('2012-01-01', freq='D'), pd.Interval(0, 1),
                    np.nan, pd.NaT, None]:
            result = _hash_scalar(val)
            expected = hash_array(np.array([val], dtype=object),
                                  categorize=True)
            assert result[0] == expected[0]

    def test_hash_tuples_err(self):

        for val in [5, 'foo', pd.Timestamp('20130101')]:
