class Explode(object):
    """
    asv benchmark for ``DataFrame.explode`` on a delimited string column.

    Each benchmark frame has ``n_rows`` rows with two columns: ``key``, a
    comma-separated string of random ASCII letters, and ``value``, random
    normal floats.
    """

    param_names = ['n_rows', 'max_list_length']
    params = [[100, 1000, 10000], [3, 5, 10]]

    def setup(self, n_rows, max_list_length):
        """
        Build the frame that ``time_explode`` operates on.

        Parameters
        ----------
        n_rows : int
            Number of rows in the generated frame.
        max_list_length : int
            Exclusive upper bound on the number of letters per ``key`` cell.
        """
        import string

        # Build the alphabet once rather than once per generated letter.
        letters = list(string.ascii_letters)
        # randint's upper bound is exclusive, so every row gets between 0 and
        # ``max_list_length - 1`` letters; 0 letters yields an empty string.
        num_letters = np.random.randint(0, max_list_length, n_rows)
        # One vectorised draw per row instead of one call per letter.
        key_column = [','.join(np.random.choice(letters, size=k))
                      for k in num_letters]
        value_column = np.random.randn(n_rows)
        self.frame = pd.DataFrame({'key': key_column,
                                   'value': value_column})

    def time_explode(self, n_rows, max_list_length):
        """Time splitting ``key`` on commas and exploding it into rows."""
        self.frame.explode('key', sep=',')
def explode(self, col_name, sep=None, dtype=None):
    """
    Expand each element of a list-like column onto its own row.

    Every element found in a cell of `col_name` produces one output row;
    the values of all other columns and the index label of the originating
    row are repeated for each element.

    .. versionadded:: 0.25.0

    Parameters
    ----------
    col_name : str
        Name of the column to be exploded.
    sep : str, default None
        Convenience to split a string `col_name` on this separator before
        exploding.
    dtype : str or dtype, default None
        Optionally coerce the dtype of the exploded column.

    Returns
    -------
    DataFrame
        New frame with the same columns, in the same order, as the caller.
        Scalar cells are kept as a single row. Missing values (as a whole
        cell or as an element of a list-like) and empty list-likes produce
        no row. An empty caller is returned as a copy.

    Raises
    ------
    KeyError
        If `col_name` is not a column of the frame.

    Examples
    --------
    >>> df = pd.DataFrame({'k': ['a,b', 'c,d'], 'v': [0, 1]})
    >>> df.explode('k', sep=',')
       k  v
    0  a  0
    0  b  0
    1  c  1
    1  d  1
    """
    # Imported locally so the method only relies on names it brings in itself.
    from pandas.core.dtypes.common import is_list_like
    from pandas.core.dtypes.missing import isna

    column = self[col_name]  # raises KeyError for an unknown column
    if len(self) == 0:
        return self.copy()

    # Compare against None explicitly: a truthiness test would silently
    # ignore an explicitly passed but falsy argument.
    cells = column if sep is None else column.str.split(sep)

    # Record, for every surviving element, the *position* of the row it
    # came from. Working by position instead of joining on index labels
    # keeps the result correct when the index contains duplicates.
    row_positions = []
    values = []
    for position, cell in enumerate(cells):
        items = cell if is_list_like(cell) else [cell]
        for item in items:
            # Nested list-likes are kept untouched; scalar missing values
            # are dropped.
            if is_list_like(item) or not isna(item):
                row_positions.append(position)
                values.append(item)

    exploded = Series(values)
    if dtype is not None:
        exploded = exploded.astype(dtype)

    # Repeat the source rows, then overwrite the exploded column in place so
    # that the original column order is preserved. ``.values`` makes the
    # assignment positional rather than index-aligned.
    result = self.iloc[row_positions].copy()
    result[col_name] = exploded.values
    return result
def test_explode():
    # GH 16538
    column_names = ['a', 'b', 'c']

    # A string column is split on ``sep`` before being exploded.
    frame = pd.DataFrame([['foo,bar', 'x', 42],
                          ['fizz,buzz', 'y', 43]],
                         columns=column_names)
    result = frame.explode('a', sep=',')
    expected = pd.DataFrame({'a': ['foo', 'bar', 'fizz', 'buzz'],
                             'b': ['x', 'x', 'y', 'y'],
                             'c': [42, 42, 43, 43]},
                            index=[0, 0, 1, 1])
    tm.assert_frame_equal(result, expected)

    # The exploded column can be coerced to a requested dtype.
    frame = pd.DataFrame([[[0, 1, 4], 'x', 42],
                          [[2, 3], 'y', 43]],
                         columns=column_names)
    result = frame.explode('a', dtype='int')
    expected = pd.DataFrame({'a': [0, 1, 4, 2, 3],
                             'b': ['x', 'x', 'x', 'y', 'y'],
                             'c': [42, 42, 42, 43, 43]},
                            index=[0, 0, 0, 1, 1])
    tm.assert_frame_equal(result, expected)

    # NaN elements and empty lists contribute no rows.
    # TODO: option to preserve explicit NAs instead
    frame = pd.DataFrame([[[], 'x', 42],
                          [[2.0, np.nan], 'y', 43]],
                         columns=column_names)
    result = frame.explode('a')
    expected = pd.DataFrame({'a': [2.0],
                             'b': ['y'],
                             'c': [43]},
                            index=[1])
    tm.assert_frame_equal(result, expected)

    # A mix of list cells and scalar cells.
    frame = pd.DataFrame([[[0, 1, 4], 'x', 42],
                          [3, 'y', 43]],
                         columns=column_names)
    result = frame.explode('a', dtype='int')
    expected = pd.DataFrame({'a': [0, 1, 4, 3],
                             'b': ['x', 'x', 'x', 'y'],
                             'c': [42, 42, 42, 43]},
                            index=[0, 0, 0, 1])
    tm.assert_frame_equal(result, expected)

    # Only scalar cells: the frame comes back unchanged.
    frame = pd.DataFrame([[0, 'x', 42],
                          [3, 'y', 43]],
                         columns=column_names)
    result = frame.explode('a')
    expected = pd.DataFrame({'a': [0, 3],
                             'b': ['x', 'y'],
                             'c': [42, 43]},
                            index=[0, 1])
    tm.assert_frame_equal(result, expected)

    # An empty frame stays empty.
    result = pd.DataFrame(columns=['a', 'b']).explode('a')
    expected = pd.DataFrame(columns=['a', 'b'])
    tm.assert_frame_equal(result, expected)

    # An unknown column name raises.
    with pytest.raises(KeyError):
        frame.explode('badcolumnname')

    # MultiIndex labels are repeated for every exploded element.
    column_names = ['a', 'b', 'c']
    source_index = pd.MultiIndex.from_tuples([(0, 'a'), (1, 'b')])
    frame = pd.DataFrame([['foo,bar', 'x', 42],
                          ['fizz,buzz', 'y', 43]],
                         columns=column_names,
                         index=source_index)
    result = frame.explode('a', sep=',')
    expected_index = pd.MultiIndex.from_tuples(
        [(0, 'a'), (0, 'a'), (1, 'b'), (1, 'b')])
    expected = pd.DataFrame({'a': ['foo', 'bar', 'fizz', 'buzz'],
                             'b': ['x', 'x', 'y', 'y'],
                             'c': [42, 42, 43, 43]},
                            index=expected_index)
    tm.assert_frame_equal(result, expected)