From ed949ae05990694d800c02eaffbae34d3bf0f2a8 Mon Sep 17 00:00:00 2001 From: changhiskhan Date: Wed, 19 Dec 2018 16:07:39 -0800 Subject: [PATCH] [ENH] Add DataFrame method to explode a list-like column (GH #16538) Sometimes a values column is presented with list-like values on one row. Instead we may want to split each individual value onto its own row, keeping the same mapping to the other key columns. While it's possible to chain together existing pandas operations (in fact that's exactly what this implementation is) to do this, the sequence of operations is not obvious. By contrast this is available as a built-in operation in say Spark and is a fairly common use case. --- asv_bench/benchmarks/reshape.py | 18 ++++++ doc/source/user_guide/reshaping.rst | 31 ++++++++++ doc/source/whatsnew/v0.24.0.rst | 30 +++++++++ pandas/core/frame.py | 51 ++++++++++++++++ pandas/tests/frame/test_reshape.py | 95 +++++++++++++++++++++++++++++ 5 files changed, 225 insertions(+) diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index f41e13163b3f5..25d4c106c0e20 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -240,4 +240,22 @@ def time_qcut_datetime(self, bins): pd.qcut(self.datetime_series, bins) +class Explode(object): + param_names = ['n_rows', 'max_list_length'] + params = [[100, 1000, 10000], [3, 5, 10]] + + def setup(self, n_rows, max_list_length): + import string + num_letters = np.random.randint(0, max_list_length, n_rows) + key_column = [','.join([np.random.choice(list(string.ascii_letters)) + for _ in range(k)]) + for k in num_letters] + value_column = np.random.randn(n_rows) + self.frame = pd.DataFrame({'key': key_column, + 'value': value_column}) + + def time_explode(self, n_rows, max_list_length): + self.frame.explode('key', sep=',') + + from .pandas_vb_common import setup # noqa: F401 diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index b7b6dd0a69c24..699b6374ca35a 
100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -801,3 +801,34 @@ Note to subdivide over multiple columns we can pass in a list to the df.pivot_table( values=['val0'], index='row', columns=['item', 'col'], aggfunc=['mean']) + +.. _reshaping.explode: + +Exploding a List-like Column +---------------------------- + +Sometimes the value column is list-like: + +.. ipython:: python + + keys = ['panda1', 'panda2', 'panda3'] + values = [['eats', 'shoots'], ['shoots', 'leaves'], ['eats', 'leaves']] + df = pd.DataFrame({'keys': keys, 'values': values}) + df + +But we actually want to put each value onto its own row. +For this purpose we can use ``DataFrame.explode``: + +.. ipython:: python + + df.explode('values') + +For convenience, we can use the optional keyword ``sep`` to automatically +split a string column before exploding: + +.. ipython:: python + + values = ['eats,shoots', 'shoots,leaves', 'eats,shoots,leaves'] + df2 = pd.DataFrame({'keys': keys, 'values': values}) + df2 + df2.explode('values', sep=',') diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index a66056f661de3..249fc72596dda 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -15,7 +15,37 @@ This is a major release from 0.23.4 and includes a number of API changes, new features, enhancements, and performance improvements along with a large number of bug fixes. +<<<<<<< HEAD Highlights include: +======= +These are the changes in pandas 0.24.0. See :ref:`release` for a full changelog +including other versions of pandas. + +.. 
_whatsnew_0240.enhancements: + +New features +~~~~~~~~~~~~ +- :func:`merge` now directly allows merge between objects of type ``DataFrame`` and named ``Series``, without the need to convert the ``Series`` object into a ``DataFrame`` beforehand (:issue:`21220`) +- ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`) +- ``FrozenList`` has gained the ``.union()`` and ``.difference()`` methods. This functionality greatly simplifies groupby's that rely on explicitly excluding certain columns. See :ref:`Splitting an object into groups <groupby.split>` for more information (:issue:`15475`, :issue:`15506`). +- :func:`DataFrame.to_parquet` now accepts ``index`` as an argument, allowing + the user to override the engine's default behavior to include or omit the + dataframe's indexes from the resulting Parquet file. (:issue:`20768`) +- :meth:`DataFrame.corr` and :meth:`Series.corr` now accept a callable for generic calculation methods of correlation, e.g. histogram intersection (:issue:`22684`) +- :func:`DataFrame.to_string` now accepts ``decimal`` as an argument, allowing the user to specify which decimal separator should be used in the output. (:issue:`23614`) +- :func:`read_feather` now accepts ``columns`` as an argument, allowing the user to specify which columns should be read. (:issue:`24025`) +- :func:`DataFrame.to_html` now accepts ``render_links`` as an argument, allowing the user to generate HTML with links to any URLs that appear in the DataFrame. + See the :ref:`section on writing HTML <io.html>` in the IO docs for example usage. (:issue:`2679`) +- :func:`DataFrame.explode` to split list-like values onto individual rows. See :ref:`section on Exploding list-like column <reshaping.explode>` in docs for more information (:issue:`16538`) + +.. 
_whatsnew_0240.values_api: + +Accessing the values in a Series or Index +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:attr:`Series.array` and :attr:`Index.array` have been added for extracting the array backing a +``Series`` or ``Index``. (:issue:`19954`, :issue:`23623`) +>>>>>>> 2138ef063... [ENH] Add DataFrame method to explode a list-like column (GH #16538) * :ref:`Optional Integer NA Support <whatsnew_0240.enhancements.intna>` * :ref:`New APIs for accessing the array backing a Series or Index <whatsnew_0240.values_api>` diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f45a13249b16c..79566cf9f583c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6423,6 +6423,57 @@ def melt( col_level=col_level, ) + def explode(self, col_name, sep=None, dtype=None): + """ + Create new DataFrame expanding a list-like column. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + col_name : str + Name of the column to be exploded. + sep : str, default None + Convenience to split a string `col_name` before exploding. + dtype : str or dtype, default None + Optionally coerce the dtype of exploded column. + + Returns + ------- + exploded: DataFrame + + See Also + -------- + Series.str.split: Split string values on specified separator. + Series.str.extract: Extract groups from the first regex match. 
+ + Examples + -------- + >>> df = pd.DataFrame({'k': ['a,b', 'c,d'], 'v': [0, 1]}) + >>> df.explode('k', sep=',') + k v + 0 a 0 + 0 b 0 + 1 c 1 + 1 d 1 + """ + col = self[col_name] + if len(self) == 0: + return self.copy() + if sep: + col_expanded = col.str.split(sep, expand=True) + else: + col_expanded = col.apply(Series) + col_stacked = (col_expanded + .stack() + .reset_index(level=-1, drop=True) + .rename(col_name)) + if dtype: + col_stacked = col_stacked.astype(dtype) + return (col_stacked.to_frame() + .join(self.drop(col_name, axis=1)) + .reindex(self.columns, axis=1)) + # ---------------------------------------------------------------------- # Time series-related diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index f3452e9a85fb3..363191b92c78a 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -1043,6 +1043,101 @@ def test_unstack_swaplevel_sortlevel(self, level): tm.assert_frame_equal(result, expected) +class TestDataFrameExplode(object): + # GH 16538 + columns = ['a', 'b', 'c'] + + def test_sep(self): + # Automatically do str.split + df = pd.DataFrame([['foo,bar', 'x', 42], + ['fizz,buzz', 'y', 43]], + columns=self.columns) + rs = df.explode('a', sep=',') + xp = pd.DataFrame({'a': ['foo', 'bar', 'fizz', 'buzz'], + 'b': ['x', 'x', 'y', 'y'], + 'c': [42, 42, 43, 43]}, + index=[0, 0, 1, 1]) + tm.assert_frame_equal(rs, xp) + + def test_dtype(self): + # Coerce dtype + df = pd.DataFrame([[[0, 1, 4], 'x', 42], + [[2, 3], 'y', 43]], + columns=self.columns) + rs = df.explode('a', dtype='int') + xp = pd.DataFrame({'a': np.array([0, 1, 4, 2, 3], dtype='int'), + 'b': ['x', 'x', 'x', 'y', 'y'], + 'c': [42, 42, 42, 43, 43]}, + index=[0, 0, 0, 1, 1]) + tm.assert_frame_equal(rs, xp) + + def test_na(self): + # NaN's and empty lists are omitted + # TODO: option to preserve explicit NAs instead + df = pd.DataFrame([[[], 'x', 42], + [[2.0, np.nan], 'y', 43]], + columns=self.columns) + rs = 
df.explode('a') + xp = pd.DataFrame({'a': [2.0], + 'b': ['y'], + 'c': [43]}, + index=[1]) + tm.assert_frame_equal(rs, xp) + + def test_nonuniform_type(self): + # Not everything is a list + df = pd.DataFrame([[[0, 1, 4], 'x', 42], + [3, 'y', 43]], + columns=self.columns) + rs = df.explode('a', dtype='int') + xp = pd.DataFrame({'a': np.array([0, 1, 4, 3], dtype='int'), + 'b': ['x', 'x', 'x', 'y'], + 'c': [42, 42, 42, 43]}, + index=[0, 0, 0, 1]) + tm.assert_frame_equal(rs, xp) + + def test_all_scalars(self): + # Nothing is a list + df = pd.DataFrame([[0, 'x', 42], + [3, 'y', 43]], + columns=self.columns) + rs = df.explode('a') + xp = pd.DataFrame({'a': [0, 3], + 'b': ['x', 'y'], + 'c': [42, 43]}, + index=[0, 1]) + tm.assert_frame_equal(rs, xp) + + def test_empty(self): + # Empty frame + rs = pd.DataFrame(columns=['a', 'b']).explode('a') + xp = pd.DataFrame(columns=['a', 'b']) + tm.assert_frame_equal(rs, xp) + + def test_missing_column(self): + # Bad column name + df = pd.DataFrame([[0, 'x', 42], + [3, 'y', 43]], + columns=self.columns) + pytest.raises(KeyError, df.explode, 'badcolumnname') + + def test_multi_index(self): + # Multi-index + idx = pd.MultiIndex.from_tuples([(0, 'a'), (1, 'b')]) + df = pd.DataFrame([['foo,bar', 'x', 42], + ['fizz,buzz', 'y', 43]], + columns=self.columns, + index=idx) + rs = df.explode('a', sep=',') + idx = pd.MultiIndex.from_tuples( + [(0, 'a'), (0, 'a'), (1, 'b'), (1, 'b')]) + xp = pd.DataFrame({'a': ['foo', 'bar', 'fizz', 'buzz'], + 'b': ['x', 'x', 'y', 'y'], + 'c': [42, 42, 43, 43]}, + index=idx) + tm.assert_frame_equal(rs, xp) + + def test_unstack_fill_frame_object(): # GH12815 Test unstacking with object. data = pd.Series(["a", "b", "c", "a"], dtype="object")