class Explode(object):
    """Benchmark ``DataFrame.explode`` on a comma-separated string column."""

    # Parameter grid: number of rows x exclusive upper bound on the number
    # of letters joined into each 'key' cell.
    param_names = ['n_rows', 'max_list_length']
    params = [[100, 1000, 10000], [3, 5, 10]]

    def setup(self, n_rows, max_list_length):
        """Build ``self.frame`` with random comma-joined letters in 'key'.

        Parameters
        ----------
        n_rows : int
            Number of rows in the generated frame.
        max_list_length : int
            Exclusive upper bound passed to ``np.random.randint``; each
            key therefore joins between 0 and ``max_list_length - 1``
            letters.
        """
        import string

        alphabet = list(string.ascii_letters)
        letters_per_row = np.random.randint(0, max_list_length, n_rows)

        keys = []
        for count in letters_per_row:
            # One draw per letter, in row order.
            picked = [np.random.choice(alphabet) for _ in range(count)]
            keys.append(','.join(picked))

        values = np.random.randn(n_rows)
        self.frame = pd.DataFrame({'key': keys, 'value': values})

    def time_explode(self, n_rows, max_list_length):
        """Timed body: split 'key' on commas and explode it into rows."""
        self.frame.explode('key', sep=',')
ipython:: python + + values = ['eats,shoots', 'shoots,leaves', 'eats,shoots,leaves'] + df2 = pd.DataFrame({'keys': keys, 'values': values}) + df2 + df2.explode('values', sep=',') diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 086519ad75192..0b5a8b9e9e7e3 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -15,7 +15,51 @@ This is a major release from 0.23.4 and includes a number of API changes, new features, enhancements, and performance improvements along with a large number of bug fixes. +<<<<<<< HEAD +These are the changes in pandas 0.24.0. See :ref:`release` for a full changelog +including other versions of pandas. + +.. _whatsnew_0240.enhancements: + +New features +~~~~~~~~~~~~ +- :func:`merge` now directly allows merge between objects of type ``DataFrame`` and named ``Series``, without the need to convert the ``Series`` object into a ``DataFrame`` beforehand (:issue:`21220`) +- ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`) +- ``FrozenList`` has gained the ``.union()`` and ``.difference()`` methods. This functionality greatly simplifies groupby's that rely on explicitly excluding certain columns. See :ref:`Splitting an object into groups <groupby.split>` for more information (:issue:`15475`, :issue:`15506`). +- :func:`DataFrame.to_parquet` now accepts ``index`` as an argument, allowing + the user to override the engine's default behavior to include or omit the + dataframe's indexes from the resulting Parquet file. (:issue:`20768`) +- :meth:`DataFrame.corr` and :meth:`Series.corr` now accept a callable for generic calculation methods of correlation, e.g. histogram intersection (:issue:`22684`) +- :func:`DataFrame.to_string` now accepts ``decimal`` as an argument, allowing the user to specify which decimal separator should be used in the output.
def explode(self, col_name, sep=None, dtype=None):
    """
    Create new DataFrame expanding a list-like column.

    Each element of a list-like cell in `col_name` is moved onto its own
    row; the values of the other columns and the row label are repeated
    for every element produced from that row.

    .. versionadded:: 0.24.0

    Parameters
    ----------
    col_name : str
        Name of the column to be exploded.
    sep : str, default None
        Convenience to split a string `col_name` before exploding.
        A falsy value (``None`` or ``''``) disables splitting.
    dtype : str or dtype, default None
        Optionally coerce the dtype of exploded column.

    Returns
    -------
    DataFrame
        New frame with the same columns, in the same order, as the
        caller. An empty caller is returned as a plain copy.

    Raises
    ------
    KeyError
        If `col_name` is not a column of the frame.

    See Also
    --------
    Series.str.split: Split string values on specified separator.
    Series.str.extract: Extract groups from the first regex match.

    Notes
    -----
    Missing entries are omitted: a NaN element, an empty list or a NaN
    cell contributes no row to the result.

    Examples
    --------
    >>> df = pd.DataFrame({'k': ['a,b', 'c,d'], 'v': [0, 1]})
    >>> df.explode('k', sep=',')
       k  v
    0  a  0
    0  b  0
    1  c  1
    1  d  1
    """
    # Look the column up first so that a bad name raises KeyError even
    # when the frame is empty.
    col = self[col_name]
    if len(self) == 0:
        return self.copy()

    if sep:
        col_expanded = col.str.split(sep, expand=True)
    else:
        col_expanded = col.apply(Series)

    # Work on row *positions* rather than labels: joining back on labels
    # multiplies rows whenever the caller's index contains duplicates.
    col_stacked = (col_expanded
                   .reset_index(drop=True)
                   .stack()
                   # Explicit, so that dropping padding / missing entries
                   # does not hinge on stack()'s own default.
                   .dropna()
                   .reset_index(level=-1, drop=True))

    # ``bool(np.dtype(...))`` is False for plain scalar dtypes, so the
    # check must be against None, not truthiness.
    if dtype is not None:
        col_stacked = col_stacked.astype(dtype)

    # Repeat each source row once per produced element; ``copy`` gives an
    # independent frame that is safe to assign into.
    positions = col_stacked.index.values.astype('intp')
    exploded = self.iloc[positions].copy()

    # Re-attach the caller's labels so that the assignment below is a
    # plain positional set rather than a label alignment.
    col_stacked.index = exploded.index
    exploded[col_name] = col_stacked
    return exploded
class TestDataFrameExplode(object):
    """Checks for ``DataFrame.explode`` (GH 16538)."""

    # Column labels shared by the fixture frames below.
    columns = ['a', 'b', 'c']

    def test_sep(self):
        # With ``sep`` the string column is split before exploding.
        frame = pd.DataFrame([['foo,bar', 'x', 42],
                              ['fizz,buzz', 'y', 43]],
                             columns=self.columns)
        expected = pd.DataFrame({'a': ['foo', 'bar', 'fizz', 'buzz'],
                                 'b': ['x', 'x', 'y', 'y'],
                                 'c': [42, 42, 43, 43]},
                                index=[0, 0, 1, 1])
        result = frame.explode('a', sep=',')
        tm.assert_frame_equal(result, expected)

    def test_dtype(self):
        # ``dtype`` coerces the exploded column.
        frame = pd.DataFrame([[[0, 1, 4], 'x', 42],
                              [[2, 3], 'y', 43]],
                             columns=self.columns)
        expected = pd.DataFrame(
            {'a': np.array([0, 1, 4, 2, 3], dtype='int'),
             'b': ['x', 'x', 'x', 'y', 'y'],
             'c': [42, 42, 42, 43, 43]},
            index=[0, 0, 0, 1, 1])
        result = frame.explode('a', dtype='int')
        tm.assert_frame_equal(result, expected)

    def test_na(self):
        # NaN elements and empty lists produce no output rows.
        # TODO: option to preserve explicit NAs instead
        frame = pd.DataFrame([[[], 'x', 42],
                              [[2.0, np.nan], 'y', 43]],
                             columns=self.columns)
        expected = pd.DataFrame({'a': [2.0],
                                 'b': ['y'],
                                 'c': [43]},
                                index=[1])
        result = frame.explode('a')
        tm.assert_frame_equal(result, expected)

    def test_nonuniform_type(self):
        # A column mixing lists and scalars.
        frame = pd.DataFrame([[[0, 1, 4], 'x', 42],
                              [3, 'y', 43]],
                             columns=self.columns)
        expected = pd.DataFrame(
            {'a': np.array([0, 1, 4, 3], dtype='int'),
             'b': ['x', 'x', 'x', 'y'],
             'c': [42, 42, 42, 43]},
            index=[0, 0, 0, 1])
        result = frame.explode('a', dtype='int')
        tm.assert_frame_equal(result, expected)

    def test_all_scalars(self):
        # A column holding only scalars comes back unchanged.
        frame = pd.DataFrame([[0, 'x', 42],
                              [3, 'y', 43]],
                             columns=self.columns)
        expected = pd.DataFrame({'a': [0, 3],
                                 'b': ['x', 'y'],
                                 'c': [42, 43]},
                                index=[0, 1])
        result = frame.explode('a')
        tm.assert_frame_equal(result, expected)

    def test_empty(self):
        # An empty frame stays empty.
        expected = pd.DataFrame(columns=['a', 'b'])
        result = pd.DataFrame(columns=['a', 'b']).explode('a')
        tm.assert_frame_equal(result, expected)

    def test_missing_column(self):
        # An unknown column name raises KeyError.
        frame = pd.DataFrame([[0, 'x', 42],
                              [3, 'y', 43]],
                             columns=self.columns)
        with pytest.raises(KeyError):
            frame.explode('badcolumnname')

    def test_multi_index(self):
        # MultiIndex row labels are repeated for every exploded element.
        source_index = pd.MultiIndex.from_tuples([(0, 'a'), (1, 'b')])
        frame = pd.DataFrame([['foo,bar', 'x', 42],
                              ['fizz,buzz', 'y', 43]],
                             columns=self.columns,
                             index=source_index)
        expected_index = pd.MultiIndex.from_tuples(
            [(0, 'a'), (0, 'a'), (1, 'b'), (1, 'b')])
        expected = pd.DataFrame({'a': ['foo', 'bar', 'fizz', 'buzz'],
                                 'b': ['x', 'x', 'y', 'y'],
                                 'c': [42, 42, 43, 43]},
                                index=expected_index)
        result = frame.explode('a', sep=',')
        tm.assert_frame_equal(result, expected)