From 645b77fcd23db79f7e8407db8bf0cd4a3303fcec Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 27 Nov 2017 00:01:07 -0800 Subject: [PATCH 1/3] CLN: ASV frame_methods benchmark Move relevant benchmark to other files Add more cleaning --- asv_bench/benchmarks/frame_methods.py | 631 ++++++++++---------------- asv_bench/benchmarks/indexing.py | 66 +++ asv_bench/benchmarks/strings.py | 12 + 3 files changed, 326 insertions(+), 383 deletions(-) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 53ee4d8019938..9a6221b9fd6d2 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -1,13 +1,16 @@ -from .pandas_vb_common import * import string +import numpy as np +import pandas.util.testing as tm +from pandas import (DataFrame, Series, MultiIndex, date_range, period_range, + isnull, NaT) -#---------------------------------------------------------------------- -# get_numeric_data -class frame_get_numeric_data(object): +class GetNumericData(object): + goal_time = 0.2 def setup(self): + np.random.seed(1234) self.df = DataFrame(np.random.randn(10000, 25)) self.df['foo'] = 'bar' self.df['bar'] = 'baz' @@ -16,19 +19,22 @@ def setup(self): def time_frame_get_numeric_data(self): self.df._get_numeric_data() -#---------------------------------------------------------------------- -# lookup -class frame_fancy_lookup(object): +class Lookup(object): + goal_time = 0.2 def setup(self): - self.df = DataFrame(np.random.randn(10000, 8), columns=list('abcdefgh')) + np.random.seed(1234) + self.df = DataFrame(np.random.randn(10000, 8), + columns=list('abcdefgh')) self.df['foo'] = 'bar' self.row_labels = list(self.df.index[::10])[:900] - self.col_labels = (list(self.df.columns) * 100) - self.row_labels_all = np.array((list(self.df.index) * len(self.df.columns)), dtype='object') - self.col_labels_all = np.array((list(self.df.columns) * len(self.df.index)), dtype='object') + self.col_labels = 
list(self.df.columns) * 100 + self.row_labels_all = np.array( + list(self.df.index) * len(self.df.columns), dtype='object') + self.col_labels_all = np.array( + list(self.df.columns) * len(self.df.index), dtype='object') def time_frame_fancy_lookup(self): self.df.lookup(self.row_labels, self.col_labels) @@ -37,25 +43,21 @@ def time_frame_fancy_lookup_all(self): self.df.lookup(self.row_labels_all, self.col_labels_all) -#---------------------------------------------------------------------- -# reindex - class Reindex(object): + goal_time = 0.2 def setup(self): - self.df = DataFrame(randn(10000, 1000)) - self.idx = np.arange(4000, 7000) - + N = 10**3 + np.random.seed(1234) + self.df = DataFrame(np.random.randn(N * 10, N)) + self.idx = np.arange(4 * N, 7 * N) self.df2 = DataFrame( - dict([(c, {0: randint(0, 2, 1000).astype(np.bool_), - 1: randint(0, 1000, 1000).astype( - np.int16), - 2: randint(0, 1000, 1000).astype( - np.int32), - 3: randint(0, 1000, 1000).astype( - np.int64),}[randint(0, 4)]) for c in - range(1000)])) + {c: {0: np.random.randint(0, 2, N).astype(np.bool_), + 1: np.random.randint(0, N, N).astype(np.int16), + 2: np.random.randint(0, N, N).astype(np.int32), + 3: np.random.randint(0, N, N).astype(np.int64)} + [np.random.randint(0, 4)] for c in range(N)}) def time_reindex_axis0(self): self.df.reindex(self.idx) @@ -67,81 +69,85 @@ def time_reindex_both_axes(self): self.df.reindex(index=self.idx, columns=self.idx) def time_reindex_both_axes_ix(self): - self.df.ix[(self.idx, self.idx)] + self.df.ix[self.idx, self.idx] def time_reindex_upcast(self): + np.random.seed(1234) self.df2.reindex(np.random.permutation(range(1200))) -#---------------------------------------------------------------------- -# iteritems (monitor no-copying behaviour) - class Iteration(object): + goal_time = 0.2 def setup(self): - self.df = DataFrame(randn(10000, 1000)) - self.df2 = DataFrame(np.random.randn(50000, 10)) - self.df3 = pd.DataFrame(np.random.randn(1000,5000), - 
columns=['C'+str(c) for c in range(5000)]) + N = 1000 + np.random.seed(1234) + self.df = DataFrame(np.random.randn(N * 10, N)) + self.df2 = DataFrame(np.random.randn(N * 50, 10)) + self.df3 = DataFrame(np.random.randn(N, 5 * N), + columns=['C' + str(c) for c in range(N * 5)]) - def f(self): + def time_iteritems(self): + # (monitor no-copying behaviour) if hasattr(self.df, '_item_cache'): self.df._item_cache.clear() - for (name, col) in self.df.iteritems(): - pass - - def g(self): - for (name, col) in self.df.iteritems(): + for name, col in self.df.iteritems(): pass - def time_iteritems(self): - self.f() - def time_iteritems_cached(self): - self.g() + for name, col in self.df.iteritems(): + pass def time_iteritems_indexing(self): - df = self.df3 - for col in df: - df[col] + for col in self.df3: + self.df3[col] def time_itertuples(self): for row in self.df2.itertuples(): pass -#---------------------------------------------------------------------- -# to_string, to_html, repr +class ToString(object): -class Formatting(object): goal_time = 0.2 def setup(self): - self.df = DataFrame(randn(100, 10)) + np.random.seed(1234) + self.df = DataFrame(np.random.randn(100, 10)) - self.nrows = 500 - self.df2 = DataFrame(randn(self.nrows, 10)) - self.df2[0] = period_range('2000', '2010', self.nrows) - self.df2[1] = range(self.nrows) + def time_to_string_floats(self): + self.df.to_string() - self.nrows = 10000 - self.data = randn(self.nrows, 10) - self.idx = MultiIndex.from_arrays(np.tile(randn(3, int(self.nrows / 100)), 100)) - self.df3 = DataFrame(self.data, index=self.idx) - self.idx = randn(self.nrows) - self.df4 = DataFrame(self.data, index=self.idx) - self.df_tall = pandas.DataFrame(np.random.randn(10000, 10)) +class ToHTML(object): - self.df_wide = pandas.DataFrame(np.random.randn(10, 10000)) + goal_time = 0.2 - def time_to_string_floats(self): - self.df.to_string() + def setup(self): + nrows = 500 + self.df2 = DataFrame(np.random.randn(nrows, 10)) + self.df2[0] = 
period_range('2000', '2010', nrows) + self.df2[1] = range(nrows) def time_to_html_mixed(self): self.df2.to_html() + +class Repr(object): + + goal_time = 0.2 + + def setup(self): + nrows = 10000 + data = np.random.randn(nrows, 10) + idx = MultiIndex.from_arrays(np.tile(np.random.randn(3, nrows // 100), + 100)) + self.df3 = DataFrame(data, index=idx) + self.df4 = DataFrame(data, index=np.random.randn(nrows)) + self.df_tall = DataFrame(np.random.randn(nrows, 10)) + self.df_wide = DataFrame(np.random.randn(10, nrows)) + def time_html_repr_trunc_mi(self): self.df3._repr_html_() @@ -155,21 +161,17 @@ def time_frame_repr_wide(self): repr(self.df_wide) -#---------------------------------------------------------------------- -# nulls/masking - - -## masking +class MaskBool(object): -class frame_mask_bools(object): goal_time = 0.2 def setup(self): - self.data = np.random.randn(1000, 500) - self.df = DataFrame(self.data) - self.df = self.df.where((self.df > 0)) - self.bools = (self.df > 0) - self.mask = isnull(self.df) + np.random.seed(1234) + data = np.random.randn(1000, 500) + df = DataFrame(data) + df = df.where(df > 0) + self.bools = df > 0 + self.mask = isnull(df) def time_frame_mask_bools(self): self.bools.mask(self.mask) @@ -178,31 +180,27 @@ def time_frame_mask_floats(self): self.bools.astype(float).mask(self.mask) -## isnull +class Isnull(object): -class FrameIsnull(object): goal_time = 0.2 def setup(self): - self.df_no_null = DataFrame(np.random.randn(1000, 1000)) - + N = 10**3 np.random.seed(1234) - self.sample = np.array([np.nan, 1.0]) - self.data = np.random.choice(self.sample, (1000, 1000)) - self.df = DataFrame(self.data) + self.df_no_null = DataFrame(np.random.randn(N, N)) - np.random.seed(1234) - self.sample = np.array(list(string.ascii_lowercase) + - list(string.ascii_uppercase) + - list(string.whitespace)) - self.data = np.random.choice(self.sample, (1000, 1000)) - self.df_strings= DataFrame(self.data) + sample = np.array([np.nan, 1.0]) + data = 
np.random.choice(sample, (N, N)) + self.df = DataFrame(data) - np.random.seed(1234) - self.sample = np.array([NaT, np.nan, None, np.datetime64('NaT'), - np.timedelta64('NaT'), 0, 1, 2.0, '', 'abcd']) - self.data = np.random.choice(self.sample, (1000, 1000)) - self.df_obj = DataFrame(self.data) + sample = np.array(list(string.ascii_letters + string.whitespace)) + data = np.random.choice(sample, (N, N)) + self.df_strings = DataFrame(data) + + sample = np.array([NaT, np.nan, None, np.datetime64('NaT'), + np.timedelta64('NaT'), 0, 1, 2.0, '', 'abcd']) + data = np.random.choice(sample, (N, N)) + self.df_obj = DataFrame(data) def time_isnull_floats_no_null(self): isnull(self.df_no_null) @@ -217,458 +215,325 @@ def time_isnull_obj(self): isnull(self.df_obj) -# ---------------------------------------------------------------------- -# fillna in place - -class frame_fillna_inplace(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(randn(10000, 100)) - self.df.values[::2] = np.nan - - def time_frame_fillna_inplace(self): - self.df.fillna(0, inplace=True) - +class Fillna(object): - -class frame_fillna_many_columns_pad(object): goal_time = 0.2 + params = ([True, False], ['pad', 'bfill']) + param_names = ['inplace', 'method'] - def setup(self): - self.values = np.random.randn(1000, 1000) - self.values[::2] = np.nan - self.df = DataFrame(self.values) - - def time_frame_fillna_many_columns_pad(self): - self.df.fillna(method='pad') + def setup(self, inplace, method): + np.random.seed(1234) + values = np.random.randn(10000, 100) + values[::2] = np.nan + self.df = DataFrame(values) + def time_frame_fillna(self, inplace, method): + self.df.fillna(inplace=inplace, method=method) class Dropna(object): + goal_time = 0.2 + params = (['all', 'any'], [0, 1]) + param_names = ['how', 'axis'] - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) + def setup(self, how, axis): + np.random.seed(1234) + self.df = 
DataFrame(np.random.randn(10000, 1000)) self.df.ix[50:1000, 20:50] = np.nan self.df.ix[2000:3000] = np.nan self.df.ix[:, 60:70] = np.nan self.df_mixed = self.df.copy() self.df_mixed['foo'] = 'bar' - self.df_mi = self.df.copy() - self.df_mi.index = MultiIndex.from_tuples(self.df_mi.index.map((lambda x: (x, x)))) - self.df_mi.columns = MultiIndex.from_tuples(self.df_mi.columns.map((lambda x: (x, x)))) - - self.df_mixed_mi = self.df_mixed.copy() - self.df_mixed_mi.index = MultiIndex.from_tuples(self.df_mixed_mi.index.map((lambda x: (x, x)))) - self.df_mixed_mi.columns = MultiIndex.from_tuples(self.df_mixed_mi.columns.map((lambda x: (x, x)))) - - def time_dropna_axis0_all(self): - self.df.dropna(how='all', axis=0) - - def time_dropna_axis0_any(self): - self.df.dropna(how='any', axis=0) + def time_dropna(self, how, axis): + self.df.dropna(how=how, axis=axis) - def time_dropna_axis1_all(self): - self.df.dropna(how='all', axis=1) + def time_dropna_axis_mixed_dtypes(self, how, axis): + self.df_mixed.dropna(how=how, axis=axis) - def time_dropna_axis1_any(self): - self.df.dropna(how='any', axis=1) - def time_dropna_axis0_all_mixed_dtypes(self): - self.df_mixed.dropna(how='all', axis=0) +class Count(object): - def time_dropna_axis0_any_mixed_dtypes(self): - self.df_mixed.dropna(how='any', axis=0) - - def time_dropna_axis1_all_mixed_dtypes(self): - self.df_mixed.dropna(how='all', axis=1) + goal_time = 0.2 - def time_dropna_axis1_any_mixed_dtypes(self): - self.df_mixed.dropna(how='any', axis=1) + params = [0, 1] + param_names = ['axis'] - def time_count_level_axis0_multi(self): - self.df_mi.count(axis=0, level=1) + def setup(self, axis): + np.random.seed(1234) + self.df = DataFrame(np.random.randn(10000, 1000)) + self.df.ix[50:1000, 20:50] = np.nan + self.df.ix[2000:3000] = np.nan + self.df.ix[:, 60:70] = np.nan + self.df_mixed = self.df.copy() + self.df_mixed['foo'] = 'bar' - def time_count_level_axis1_multi(self): - self.df_mi.count(axis=1, level=1) + self.df.index = 
MultiIndex.from_arrays([self.df.index, self.df.index]) + self.df.columns = MultiIndex.from_arrays([self.df.columns, + self.df.columns]) + self.df_mixed.index = MultiIndex.from_arrays([self.df_mixed.index, + self.df_mixed.index]) + self.df_mixed.columns = MultiIndex.from_arrays([self.df_mixed.columns, + self.df_mixed.columns]) - def time_count_level_axis0_mixed_dtypes_multi(self): - self.df_mixed_mi.count(axis=0, level=1) + def time_count_level_multi(self, axis): + self.df.count(axis=axis, level=1) - def time_count_level_axis1_mixed_dtypes_multi(self): - self.df_mixed_mi.count(axis=1, level=1) + def time_count_level_mixed_dtypes_multi(self, axis): + self.df_mixed.count(axis=axis, level=1) class Apply(object): + goal_time = 0.2 def setup(self): + np.random.seed(1234) self.df = DataFrame(np.random.randn(1000, 100)) self.s = Series(np.arange(1028.0)) self.df2 = DataFrame({i: self.s for i in range(1028)}) - self.df3 = DataFrame(np.random.randn(1000, 3), columns=list('ABC')) def time_apply_user_func(self): - self.df2.apply((lambda x: np.corrcoef(x, self.s)[(0, 1)])) + self.df2.apply(lambda x: np.corrcoef(x, self.s)[(0, 1)]) def time_apply_axis_1(self): - self.df.apply((lambda x: (x + 1)), axis=1) + self.df.apply(lambda x: x + 1, axis=1) def time_apply_lambda_mean(self): - self.df.apply((lambda x: x.mean())) + self.df.apply(lambda x: x.mean()) def time_apply_np_mean(self): self.df.apply(np.mean) def time_apply_pass_thru(self): - self.df.apply((lambda x: x)) + self.df.apply(lambda x: x) def time_apply_ref_by_name(self): - self.df3.apply((lambda x: (x['A'] + x['B'])), axis=1) + self.df3.apply(lambda x: x['A'] + x['B'], axis=1) -#---------------------------------------------------------------------- -# dtypes +class Dtypes(object): -class frame_dtypes(object): goal_time = 0.2 def setup(self): + np.random.seed(1234) self.df = DataFrame(np.random.randn(1000, 1000)) def time_frame_dtypes(self): self.df.dtypes 
-#---------------------------------------------------------------------- -# equals class Equals(object): + goal_time = 0.2 def setup(self): - self.float_df = DataFrame(np.random.randn(1000, 1000)) - self.object_df = DataFrame(([(['foo'] * 1000)] * 1000)) - self.nonunique_cols = self.object_df.copy() - self.nonunique_cols.columns = (['A'] * len(self.nonunique_cols.columns)) - self.pairs = dict([(name, self.make_pair(frame)) for (name, frame) in ( - ('float_df', self.float_df), ('object_df', self.object_df), - ('nonunique_cols', self.nonunique_cols))]) - - def make_pair(self, frame): - self.df = frame - self.df2 = self.df.copy() - self.df2.ix[((-1), (-1))] = np.nan - return (self.df, self.df2) + N = 10**3 + np.random.seed(1234) + self.float_df = DataFrame(np.random.randn(N, N)) + self.float_df_nan = self.float_df.copy() + self.float_df_nan.iloc[-1, -1] = np.nan - def test_equal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df) + self.object_df = DataFrame('foo', index=range(N), columns=range(N)) + self.object_df_nan = self.object_df.copy() + self.object_df_nan.iloc[-1, -1] = np.nan - def test_unequal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df2) + self.nonunique_cols = self.object_df.copy() + self.nonunique_cols.columns = ['A'] * len(self.nonunique_cols.columns) + self.nonunique_cols_nan = self.nonunique_cols.copy() + self.nonunique_cols_nan.iloc[-1, -1] = np.nan def time_frame_float_equal(self): - self.test_equal('float_df') + self.float_df.equals(self.float_df) def time_frame_float_unequal(self): - self.test_unequal('float_df') + self.float_df.equals(self.float_df_nan) def time_frame_nonunique_equal(self): - self.test_equal('nonunique_cols') + self.nonunique_cols.equals(self.nonunique_cols) def time_frame_nonunique_unequal(self): - self.test_unequal('nonunique_cols') + self.nonunique_cols.equals(self.nonunique_cols_nan) def time_frame_object_equal(self): - self.test_equal('object_df') + 
self.object_df.equals(self.object_df) def time_frame_object_unequal(self): - self.test_unequal('object_df') + self.object_df.equals(self.object_df_nan) class Interpolate(object): + goal_time = 0.2 + params = [None, 'infer'] + param_names = ['downcast'] - def setup(self): + def setup(self, downcast): + N = 10000 + np.random.seed(1234) # this is the worst case, where every column has NaNs. - self.df = DataFrame(randn(10000, 100)) + self.df = DataFrame(np.random.randn(N, 100)) self.df.values[::2] = np.nan - self.df2 = DataFrame( - {'A': np.arange(0, 10000), 'B': np.random.randint(0, 100, 10000), - 'C': randn(10000), 'D': randn(10000),}) + self.df2 = DataFrame({'A': np.arange(0, N), + 'B': np.random.randint(0, 100, N), + 'C': np.random.randn(N), + 'D': np.random.randn(N)}) self.df2.loc[1::5, 'A'] = np.nan self.df2.loc[1::5, 'C'] = np.nan - def time_interpolate(self): - self.df.interpolate() - - def time_interpolate_some_good(self): - self.df2.interpolate() + def time_interpolate(self, downcast): + self.df.interpolate(downcast=downcast) - def time_interpolate_some_good_infer(self): - self.df2.interpolate(downcast='infer') + def time_interpolate_some_good(self, downcast): + self.df2.interpolate(downcast=downcast) class Shift(object): # frame shift speedup issue-5609 goal_time = 0.2 + params = [0, 1] + param_names = ['axis'] - def setup(self): + def setup(self, axis): + np.random.seed(1234) self.df = DataFrame(np.random.rand(10000, 500)) - def time_shift_axis0(self): - self.df.shift(1, axis=0) - - def time_shift_axis_1(self): - self.df.shift(1, axis=1) + def time_shift(self, axis): + self.df.shift(1, axis=axis) -#----------------------------------------------------------------------------- -# from_records issue-6700 +class FromRecords(object): -class frame_from_records_generator(object): goal_time = 0.2 + params = [None, 1000] + param_names = ['nrows'] - def get_data(self, n=100000): - return ((x, (x * 20), (x * 100)) for x in range(n)) + def setup(self, nrows): + N = 
100000 + self.gen = ((x, (x * 20), (x * 100)) for x in range(N)) - def time_frame_from_records_generator(self): - self.df = DataFrame.from_records(self.get_data()) + def time_frame_from_records_generator(self, nrows): + # issue-6700 + self.df = DataFrame.from_records(self.gen, nrows=nrows) - def time_frame_from_records_generator_nrows(self): - self.df = DataFrame.from_records(self.get_data(), nrows=1000) - - -#----------------------------------------------------------------------------- -# nunique - -class frame_nunique(object): +class Nunique(object): def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) + np.random.seed(1234) + self.df = DataFrame(np.random.randn(10000, 1000)) def time_frame_nunique(self): self.df.nunique() +class Duplicated(object): -#----------------------------------------------------------------------------- -# duplicated - -class frame_duplicated(object): goal_time = 0.2 def setup(self): - self.n = (1 << 20) - self.t = date_range('2015-01-01', freq='S', periods=(self.n // 64)) - self.xs = np.random.randn((self.n // 64)).round(2) - self.df = DataFrame({'a': np.random.randint(((-1) << 8), (1 << 8), self.n), 'b': np.random.choice(self.t, self.n), 'c': np.random.choice(self.xs, self.n), }) - - self.df2 = DataFrame(np.random.randn(1000, 100).astype(str)) + np.random.seed(1234) + n = (1 << 20) + t = date_range('2015-01-01', freq='S', periods=(n // 64)) + xs = np.random.randn(n // 64).round(2) + self.df = DataFrame({'a': np.random.randint(-1 << 8, 1 << 8, n), + 'b': np.random.choice(t, n), + 'c': np.random.choice(xs, n)}) + self.df2 = DataFrame(np.random.randn(1000, 100).astype(str)).T def time_frame_duplicated(self): self.df.duplicated() def time_frame_duplicated_wide(self): - self.df2.T.duplicated() - - - - - - - - - - - - - + self.df2.duplicated() +class XS(object): - -class frame_xs_col(object): goal_time = 0.2 + params = [0, 1] + param_names = ['axis'] - def setup(self): - self.df = DataFrame(randn(1, 
100000)) - - def time_frame_xs_col(self): - self.df.xs(50000, axis=1) - - -class frame_xs_row(object): - goal_time = 0.2 + def setup(self, axis): + np.random.seed(1234) + self.N = 10**4 + self.df = DataFrame(np.random.randn(self.N, self.N)) - def setup(self): - self.df = DataFrame(randn(100000, 1)) + def time_frame_xs(self, axis): + self.df.xs(self.N / 2, axis=axis) - def time_frame_xs_row(self): - self.df.xs(50000) +class SortIndex(object): -class frame_sort_index(object): goal_time = 0.2 + params = [True, False] + param_names = ['ascending'] - def setup(self): - self.df = DataFrame(randn(1000000, 2), columns=list('AB')) + def setup(self, ascending): + np.random.seed(1234) + self.df = DataFrame(np.random.randn(1000000, 2), columns=list('AB')) - def time_frame_sort_index(self): - self.df.sort_index() + def time_frame_sort_index(self, ascending): + self.df.sort_index(ascending=ascending) -class frame_sort_index_by_columns(object): +class SortIndexByColumns(object): + goal_time = 0.2 def setup(self): - self.N = 10000 - self.K = 10 - self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) - self.col_array_list = list(self.df.values.T) + np.random.seed(1234) + N = 10000 + K = 10 + self.df = DataFrame({'key1': tm.makeStringIndex(N).values.repeat(K), + 'key2': tm.makeStringIndex(N).values.repeat(K), + 'value': np.random.randn(N * K)}) def time_frame_sort_index_by_columns(self): self.df.sort_index(by=['key1', 'key2']) -class frame_quantile_axis1(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(1000, 3), - columns=list('ABC')) - - def time_frame_quantile_axis1(self): - self.df.quantile([0.1, 0.5], axis=1) - - -#---------------------------------------------------------------------- -# boolean indexing - -class frame_boolean_row_select(object): - goal_time = 0.2 - 
- def setup(self): - self.df = DataFrame(randn(10000, 100)) - self.bool_arr = np.zeros(10000, dtype=bool) - self.bool_arr[:1000] = True - - def time_frame_boolean_row_select(self): - self.df[self.bool_arr] - -class frame_getitem_single_column(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(randn(10000, 1000)) - self.df2 = DataFrame(randn(3000, 1), columns=['A']) - self.df3 = DataFrame(randn(3000, 1)) - - def h(self): - for i in range(10000): - self.df2['A'] - - def j(self): - for i in range(10000): - self.df3[0] - - def time_frame_getitem_single_column(self): - self.h() - - def time_frame_getitem_single_column2(self): - self.j() - - -#---------------------------------------------------------------------- -# assignment - -class frame_assign_timeseries_index(object): - goal_time = 0.2 - - def setup(self): - self.idx = date_range('1/1/2000', periods=100000, freq='H') - self.df = DataFrame(randn(100000, 1), columns=['A'], index=self.idx) - - def time_frame_assign_timeseries_index(self): - self.f(self.df) - - def f(self, df): - self.x = self.df.copy() - self.x['date'] = self.x.index - +class Quantile(object): - -# insert many columns - -class frame_insert_100_columns_begin(object): - goal_time = 0.2 - - def setup(self): - self.N = 1000 - - def f(self, K=100): - self.df = DataFrame(index=range(self.N)) - self.new_col = np.random.randn(self.N) - for i in range(K): - self.df.insert(0, i, self.new_col) - - def g(self, K=500): - self.df = DataFrame(index=range(self.N)) - self.new_col = np.random.randn(self.N) - for i in range(K): - self.df[i] = self.new_col - - def time_frame_insert_100_columns_begin(self): - self.f() - - def time_frame_insert_500_columns_end(self): - self.g() - - - -#---------------------------------------------------------------------- -# strings methods, #2602 - -class series_string_vector_slice(object): goal_time = 0.2 + params = [0, 1] + param_names = ['axis'] - def setup(self): - self.s = Series((['abcdefg', np.nan] * 500000)) - - 
def time_series_string_vector_slice(self): - self.s.str[:5] + def setup(self, axis): + np.random.seed(1234) + self.df = DataFrame(np.random.randn(1000, 3), columns=list('ABC')) + def time_frame_quantile(self, axis): + self.df.quantile([0.1, 0.5], axis=axis) -#---------------------------------------------------------------------- -# df.info() and get_dtype_counts() # 2807 -class frame_get_dtype_counts(object): +class GetDtypeCounts(object): + # 2807 goal_time = 0.2 def setup(self): + np.random.seed(1234) self.df = DataFrame(np.random.randn(10, 10000)) def time_frame_get_dtype_counts(self): self.df.get_dtype_counts() + def time_info(self): + self.df.info() + + +class Nlargest(object): -class frame_nlargest(object): goal_time = 0.2 def setup(self): - self.df = DataFrame(np.random.randn(1000, 3), - columns=list('ABC')) + np.random.seed(1234) + self.df = DataFrame(np.random.randn(1000, 3), columns=list('ABC')) def time_frame_nlargest(self): self.df.nlargest(100, 'A') diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index f3e7ebbbd33e8..f271b82c758ee 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -303,3 +303,69 @@ def time_lookup_ix(self): def time_lookup_loc(self): self.s.loc + + +class BooleanRowSelect(object): + + goal_time = 0.2 + + def setup(self): + N = 10000 + np.random.seed(1234) + self.df = DataFrame(np.random.randn(N, 100)) + self.bool_arr = np.zeros(N, dtype=bool) + self.bool_arr[:1000] = True + + def time_frame_boolean_row_select(self): + self.df[self.bool_arr] + + +class GetItemSingleColumn(object): + + goal_time = 0.2 + + def setup(self): + np.random.seed(1234) + self.df2 = DataFrame(np.random.randn(3000, 1), columns=['A']) + self.df3 = DataFrame(np.random.randn(3000, 1)) + + def time_frame_getitem_single_column_label(self): + self.df2['A'] + + def time_frame_getitem_single_column_int(self): + self.df3[0] + + +class AssignTimeseriesIndex(object): + + goal_time = 0.2 + + def 
setup(self): + N = 100000 + np.random.seed(1234) + idx = date_range('1/1/2000', periods=N, freq='H') + self.df = DataFrame(np.random.randn(N, 1), columns=['A'], index=idx) + + def time_frame_assign_timeseries_index(self): + self.df['date'] = self.df.index + + +class InsertColumns(object): + + goal_time = 0.2 + + def setup(self): + self.N = 10**3 + self.df = DataFrame(index=range(self.N)) + + def time_insert(self): + np.random.seed(1234) + for i in range(100): + self.df.insert(0, i, np.random.randn(self.N), allow_duplicates=True) + + def time_assign_with_setitem(self): + np.random.seed(1234) + for i in range(100): + self.df[i] = np.random.randn(self.N) + + diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index c1600d4e07f58..0c36c85413c76 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -105,3 +105,15 @@ def setup(self): def time_encode_decode(self): self.ser.str.encode('utf-8').str.decode('utf-8') + + +class StringSlice(object): + + goal_time = 0.2 + + def setup(self): + self.s = Series(['abcdefg', np.nan] * 500000) + + def time_series_string_vector_slice(self): + # GH 2602 + self.s.str[:5] \ No newline at end of file From 6b581f3a74071909791c9307c30eed3594134a6d Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 27 Nov 2017 17:07:08 -0800 Subject: [PATCH 2/3] Add blank line at the end of string.py --- asv_bench/benchmarks/strings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 0c36c85413c76..948d4b92a5a57 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -116,4 +116,4 @@ def setup(self): def time_series_string_vector_slice(self): # GH 2602 - self.s.str[:5] \ No newline at end of file + self.s.str[:5] From 12e4686f2925eb9f1168f2a8939a9d5562d7515e Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 28 Nov 2017 22:21:45 -0800 Subject: [PATCH 3/3] Address comments --- 
asv_bench/benchmarks/frame_ctor.py | 15 ++++++ asv_bench/benchmarks/frame_methods.py | 67 +++++++----------------- asv_bench/benchmarks/pandas_vb_common.py | 6 ++- 3 files changed, 39 insertions(+), 49 deletions(-) diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index 5fad7b682c2ed..d577ebc20a31c 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -124,3 +124,18 @@ def setup(self, offset, n_steps): def time_frame_ctor(self, offset, n_steps): DataFrame(self.d) + + +class FromRecords(object): + + goal_time = 0.2 + params = [None, 1000] + param_names = ['nrows'] + + def setup(self, nrows): + N = 100000 + self.gen = ((x, (x * 20), (x * 100)) for x in range(N)) + + def time_frame_from_records_generator(self, nrows): + # issue-6700 + self.df = DataFrame.from_records(self.gen, nrows=nrows) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 9a6221b9fd6d2..7ed341425e561 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -3,6 +3,7 @@ import pandas.util.testing as tm from pandas import (DataFrame, Series, MultiIndex, date_range, period_range, isnull, NaT) +from .pandas_vb_common import setup class GetNumericData(object): @@ -10,7 +11,6 @@ class GetNumericData(object): goal_time = 0.2 def setup(self): - np.random.seed(1234) self.df = DataFrame(np.random.randn(10000, 25)) self.df['foo'] = 'bar' self.df['bar'] = 'baz' @@ -25,7 +25,6 @@ class Lookup(object): goal_time = 0.2 def setup(self): - np.random.seed(1234) self.df = DataFrame(np.random.randn(10000, 8), columns=list('abcdefgh')) self.df['foo'] = 'bar' @@ -49,7 +48,6 @@ class Reindex(object): def setup(self): N = 10**3 - np.random.seed(1234) self.df = DataFrame(np.random.randn(N * 10, N)) self.idx = np.arange(4 * N, 7 * N) self.df2 = DataFrame( @@ -72,7 +70,6 @@ def time_reindex_both_axes_ix(self): self.df.ix[self.idx, self.idx] def 
time_reindex_upcast(self): - np.random.seed(1234) self.df2.reindex(np.random.permutation(range(1200))) @@ -82,7 +79,6 @@ class Iteration(object): def setup(self): N = 1000 - np.random.seed(1234) self.df = DataFrame(np.random.randn(N * 10, N)) self.df2 = DataFrame(np.random.randn(N * 50, 10)) self.df3 = DataFrame(np.random.randn(N, 5 * N), @@ -107,13 +103,16 @@ def time_itertuples(self): for row in self.df2.itertuples(): pass + def time_iterrows(self): + for row in self.df.iterrows(): + pass + class ToString(object): goal_time = 0.2 def setup(self): - np.random.seed(1234) self.df = DataFrame(np.random.randn(100, 10)) def time_to_string_floats(self): @@ -166,7 +165,6 @@ class MaskBool(object): goal_time = 0.2 def setup(self): - np.random.seed(1234) data = np.random.randn(1000, 500) df = DataFrame(data) df = df.where(df > 0) @@ -186,7 +184,6 @@ class Isnull(object): def setup(self): N = 10**3 - np.random.seed(1234) self.df_no_null = DataFrame(np.random.randn(N, N)) sample = np.array([np.nan, 1.0]) @@ -222,7 +219,6 @@ class Fillna(object): param_names = ['inplace', 'method'] def setup(self, inplace, method): - np.random.seed(1234) values = np.random.randn(10000, 100) values[::2] = np.nan self.df = DataFrame(values) @@ -238,7 +234,6 @@ class Dropna(object): param_names = ['how', 'axis'] def setup(self, how, axis): - np.random.seed(1234) self.df = DataFrame(np.random.randn(10000, 1000)) self.df.ix[50:1000, 20:50] = np.nan self.df.ix[2000:3000] = np.nan @@ -261,7 +256,6 @@ class Count(object): param_names = ['axis'] def setup(self, axis): - np.random.seed(1234) self.df = DataFrame(np.random.randn(10000, 1000)) self.df.ix[50:1000, 20:50] = np.nan self.df.ix[2000:3000] = np.nan @@ -289,7 +283,6 @@ class Apply(object): goal_time = 0.2 def setup(self): - np.random.seed(1234) self.df = DataFrame(np.random.randn(1000, 100)) self.s = Series(np.arange(1028.0)) @@ -320,7 +313,6 @@ class Dtypes(object): goal_time = 0.2 def setup(self): - np.random.seed(1234) self.df = 
DataFrame(np.random.randn(1000, 1000)) def time_frame_dtypes(self): @@ -333,7 +325,6 @@ class Equals(object): def setup(self): N = 10**3 - np.random.seed(1234) self.float_df = DataFrame(np.random.randn(N, N)) self.float_df_nan = self.float_df.copy() self.float_df_nan.iloc[-1, -1] = np.nan @@ -374,7 +365,6 @@ class Interpolate(object): def setup(self, downcast): N = 10000 - np.random.seed(1234) # this is the worst case, where every column has NaNs. self.df = DataFrame(np.random.randn(N, 100)) self.df.values[::2] = np.nan @@ -400,32 +390,15 @@ class Shift(object): param_names = ['axis'] def setup(self, axis): - np.random.seed(1234) self.df = DataFrame(np.random.rand(10000, 500)) def time_shift(self, axis): self.df.shift(1, axis=axis) -class FromRecords(object): - - goal_time = 0.2 - params = [None, 1000] - param_names = ['nrows'] - - def setup(self, nrows): - N = 100000 - self.gen = ((x, (x * 20), (x * 100)) for x in range(N)) - - def time_frame_from_records_generator(self, nrows): - # issue-6700 - self.df = DataFrame.from_records(self.gen, nrows=nrows) - - class Nunique(object): def setup(self): - np.random.seed(1234) self.df = DataFrame(np.random.randn(10000, 1000)) def time_frame_nunique(self): @@ -437,7 +410,6 @@ class Duplicated(object): goal_time = 0.2 def setup(self): - np.random.seed(1234) n = (1 << 20) t = date_range('2015-01-01', freq='S', periods=(n // 64)) xs = np.random.randn(n // 64).round(2) @@ -460,7 +432,6 @@ class XS(object): param_names = ['axis'] def setup(self, axis): - np.random.seed(1234) self.N = 10**4 self.df = DataFrame(np.random.randn(self.N, self.N)) @@ -468,18 +439,17 @@ def time_frame_xs(self, axis): self.df.xs(self.N / 2, axis=axis) -class SortIndex(object): +class SortValues(object): goal_time = 0.2 params = [True, False] param_names = ['ascending'] def setup(self, ascending): - np.random.seed(1234) self.df = DataFrame(np.random.randn(1000000, 2), columns=list('AB')) - def time_frame_sort_index(self, ascending): - 
self.df.sort_index(ascending=ascending) + def time_frame_sort_values(self, ascending): + self.df.sort_values(by='A', ascending=ascending) class SortIndexByColumns(object): @@ -487,15 +457,14 @@ class SortIndexByColumns(object): goal_time = 0.2 def setup(self): - np.random.seed(1234) N = 10000 K = 10 self.df = DataFrame({'key1': tm.makeStringIndex(N).values.repeat(K), 'key2': tm.makeStringIndex(N).values.repeat(K), 'value': np.random.randn(N * K)}) - def time_frame_sort_index_by_columns(self): - self.df.sort_index(by=['key1', 'key2']) + def time_frame_sort_values_by_columns(self): + self.df.sort_values(by=['key1', 'key2']) class Quantile(object): @@ -505,7 +474,6 @@ class Quantile(object): param_names = ['axis'] def setup(self, axis): - np.random.seed(1234) self.df = DataFrame(np.random.randn(1000, 3), columns=list('ABC')) def time_frame_quantile(self, axis): @@ -517,7 +485,6 @@ class GetDtypeCounts(object): goal_time = 0.2 def setup(self): - np.random.seed(1234) self.df = DataFrame(np.random.randn(10, 10000)) def time_frame_get_dtype_counts(self): @@ -527,13 +494,17 @@ def time_info(self): self.df.info() -class Nlargest(object): +class NSort(object): goal_time = 0.2 + params = ['first', 'last'] + param_names = ['keep'] - def setup(self): - np.random.seed(1234) + def setup(self, keep): self.df = DataFrame(np.random.randn(1000, 3), columns=list('ABC')) - def time_frame_nlargest(self): - self.df.nlargest(100, 'A') + def time_nlargest(self, keep): + self.df.nlargest(100, 'A', keep=keep) + + def time_nsmallest(self, keep): + self.df.nsmallest(100, 'A', keep=keep) diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py index b1a58e49fe86c..62eb826418030 100644 --- a/asv_bench/benchmarks/pandas_vb_common.py +++ b/asv_bench/benchmarks/pandas_vb_common.py @@ -13,7 +13,11 @@ except ImportError: pass -np.random.seed(1234) +# This function just needs to be imported into each benchmark file in order to +# set up the random seed before 
each function. +# http://asv.readthedocs.io/en/latest/writing_benchmarks.html +def setup(*args, **kwargs): + np.random.seed(1234) # try em until it works! for imp in ['pandas._libs.lib', 'pandas.lib', 'pandas_tseries']: