From 40b6cbb7c9c0f3517315123c9762286134c5c0a6 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 22 May 2017 06:37:03 -0400 Subject: [PATCH] PERF: don't materialize arrays on checking in groupby --- asv_bench/benchmarks/groupby.py | 9 +++++++++ doc/source/whatsnew/v0.20.2.txt | 2 +- pandas/core/indexes/base.py | 1 - 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index c0c3a42cc4464d..13b5cd2b060322 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -368,6 +368,11 @@ def setup(self): self.dates = (np.datetime64('now') + self.offsets) self.df = DataFrame({'key1': np.random.randint(0, 500, size=self.n), 'key2': np.random.randint(0, 100, size=self.n), 'value1': np.random.randn(self.n), 'value2': np.random.randn(self.n), 'value3': np.random.randn(self.n), 'dates': self.dates, }) + N = 1000000 + self.draws = pd.Series(np.random.randn(N)) + labels = pd.Series(['foo', 'bar', 'baz', 'qux'] * (N // 4)) + self.cats = labels.astype('category') + def time_groupby_multi_size(self): self.df.groupby(['key1', 'key2']).size() @@ -377,6 +382,10 @@ def time_groupby_dt_size(self): def time_groupby_dt_timegrouper_size(self): self.df.groupby(TimeGrouper(key='dates', freq='M')).size() + def time_groupby_size(self): + self.draws.groupby(self.cats).size() + + #---------------------------------------------------------------------- # groupby with a variable value for ngroups diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index 4ec9daff4c0fcc..b0bb1485a789a9 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -29,7 +29,7 @@ Performance Improvements - Performance regression fix when indexing with a list-like (:issue:`16285`) - Performance regression fix for MultiIndexes (:issue:`16319`, :issue:`16346`) - Improved performance of ``.clip()`` with scalar arguments (:issue:`15400`) - +- Improved performance of 
groupby with categorical groupers (:issue:`16413`) .. _whatsnew_0202.bug_fixes: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 9b29f1b04ff73a..2af4f112ca9414 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2388,7 +2388,6 @@ def get_loc(self, key, method=None, tolerance=None): if tolerance is not None: raise ValueError('tolerance argument only valid if using pad, ' 'backfill or nearest lookups') - key = _values_from_object(key) try: return self._engine.get_loc(key) except KeyError: