[ENH] Add DataFrame method to explode a list-like column (GH pandas-dev#16538)

Sometimes a values column is presented with list-like values on one row. Instead, we may want to split each individual value onto its own row, keeping the same mapping to the other key columns. While it's possible to chain together existing pandas operations to do this (in fact, that's exactly what this implementation does), the sequence of operations is not obvious. By contrast, this is available as a built-in operation in, say, Spark, and is a fairly common use case.
changhiskhan · Dec 20, 2018 · 2c6f058 · 2c6f058
1 parent 14c33b0
commit 2c6f058
Show file tree

Hide file tree

Showing 4 changed files with 186 additions and 0 deletions.
diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py
@@ -184,4 +184,22 @@ def time_qcut_datetime(self, bins):
         pd.qcut(self.datetime_series, bins)
 
 
+class Explode(object):
+    # asv benchmark: explode a comma-separated string column of a frame
+    # with ``n_rows`` rows.
+    param_names = ['n_rows', 'max_list_length']
+    params = [[100, 1000, 10000], [3, 5, 10]]
+
+    def setup(self, n_rows, max_list_length):
+        from string import ascii_letters
+        alphabet = list(ascii_letters)
+        # One draw per row from [0, max_list_length): how many letters
+        # that row's key will hold.
+        counts = np.random.randint(0, max_list_length, n_rows)
+        keys = []
+        for count in counts:
+            picked = [np.random.choice(alphabet) for _ in range(count)]
+            keys.append(','.join(picked))
+        values = np.random.randn(n_rows)
+        self.frame = pd.DataFrame({'key': keys, 'value': values})
+
+    def time_explode(self, n_rows, max_list_length):
+        # Only the explode call itself is timed; the frame is built in setup.
+        self.frame.explode('key', sep=',')
+
 from .pandas_vb_common import setup  # noqa: F401
diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst
@@ -801,3 +801,45 @@ Note to subdivide over multiple columns we can pass in a list to the
 
    df.pivot_table(
        values=['val0'], index='row', columns=['item', 'col'], aggfunc=['mean'])
+
+Exploding a List-like Column
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. ipython:: python
+   :suppress:
+
+   import pandas as pd
+   df = pd.DataFrame({'keys': ['panda1', 'panda2', 'panda3'],
+                      'values': [['eats', 'shoots'],
+                                 ['shoots', 'leaves'],
+                                 ['eats', 'shoots', 'leaves']]})
+   exploded = df.explode('values')
+   df2 = pd.DataFrame({'keys': ['panda1', 'panda2', 'panda3'],
+                       'values': ['eats,shoots',
+                                  'shoots,leaves',
+                                  'eats,shoots,leaves']})
+
+Sometimes the value column is list-like:
+
+.. ipython:: python
+
+   df
+
+But we actually want to put each value onto its own row:
+
+.. ipython:: python
+
+   exploded
+
+For this we can use ``DataFrame.explode``:
+
+.. ipython:: python
+
+   df.explode('values')
+
+For convenience, we can use the optional keyword ``sep`` to automatically
+split a string values column before exploding:
+
+.. ipython:: python
+
+   df2
+
+   df2.explode('values', sep=',')
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -5980,6 +5980,48 @@ def melt(self, id_vars=None, value_vars=None, var_name=None,
                     var_name=var_name, value_name=value_name,
                     col_level=col_level)
 
+    def explode(self, col_name, sep=None, dtype=None):
+        """
+        Create a new DataFrame where each element in each row
+        of a list-like column `col_name` is expanded to its own row.
+
+        The index label and the values of the other columns are repeated
+        for every row produced from the same original row. NaN elements
+        and empty lists are omitted from the result.
+
+        .. versionadded:: 0.25.0
+
+        Parameters
+        ----------
+        col_name : str
+            Name of the column to be exploded
+        sep : str, default None
+            Convenience to split a string `col_name` before exploding.
+            None or an empty string means no splitting is done.
+        dtype : str or dtype, default None
+            Optionally coerce the dtype of exploded column
+
+        Returns
+        -------
+        DataFrame
+            Exploded frame with the same columns, in the same order,
+            as the caller. An empty frame is returned as a copy.
+
+        Raises
+        ------
+        KeyError
+            If `col_name` is not a column of the frame.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({'k': ['a,b', 'c,d'], 'v': [0, 1]})
+        >>> df.explode('k', sep=',')
+           k  v
+        0  a  0
+        0  b  0
+        1  c  1
+        1  d  1
+        """
+        # Look the column up first so a bad name raises KeyError even
+        # when the frame has no rows.
+        col = self[col_name]
+        if len(self) == 0:
+            return self.copy()
+        if sep:
+            col_expanded = col.str.split(sep, expand=True)
+        else:
+            col_expanded = col.apply(Series)
+        # One column per list position -> one row per element; the helper
+        # position level is dropped so the original index labels repeat.
+        col_stacked = (col_expanded
+                       .stack()
+                       .reset_index(level=-1, drop=True)
+                       .rename(col_name))
+        # Compare against None explicitly: numpy dtype objects are falsy
+        # (their len() is 0), so ``if dtype:`` would silently skip them.
+        if dtype is not None:
+            col_stacked = col_stacked.astype(dtype)
+        # The join puts the exploded column first; restore the caller's
+        # original column order.
+        return (col_stacked.to_frame()
+                .join(self.drop(col_name, axis=1))
+                .reindex(columns=self.columns))
+
     # ----------------------------------------------------------------------
     # Time series-related
 

diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py
@@ -918,6 +918,90 @@ def test_unstack_swaplevel_sortlevel(self, level):
         tm.assert_frame_equal(result, expected)
 
 
+def test_explode():
+    # GH 16538
+    columns = ['a', 'b', 'c']
+
+    # A string column is split on ``sep`` before being exploded
+    frame = pd.DataFrame([['foo,bar', 'x', 42],
+                          ['fizz,buzz', 'y', 43]],
+                         columns=columns)
+    result = frame.explode('a', sep=',')
+    expected = pd.DataFrame({'a': ['foo', 'bar', 'fizz', 'buzz'],
+                             'b': ['x', 'x', 'y', 'y'],
+                             'c': [42, 42, 43, 43]},
+                            index=[0, 0, 1, 1])
+    tm.assert_frame_equal(result, expected)
+
+    # The exploded column can be coerced to a requested dtype
+    frame = pd.DataFrame([[[0, 1, 4], 'x', 42],
+                          [[2, 3], 'y', 43]],
+                         columns=columns)
+    result = frame.explode('a', dtype='int')
+    expected = pd.DataFrame({'a': [0, 1, 4, 2, 3],
+                             'b': ['x', 'x', 'x', 'y', 'y'],
+                             'c': [42, 42, 42, 43, 43]},
+                            index=[0, 0, 0, 1, 1])
+    tm.assert_frame_equal(result, expected)
+
+    # NaN elements and empty lists produce no output rows
+    # TODO: option to preserve explicit NAs instead
+    frame = pd.DataFrame([[[], 'x', 42],
+                          [[2.0, np.nan], 'y', 43]],
+                         columns=columns)
+    result = frame.explode('a')
+    expected = pd.DataFrame({'a': [2.0],
+                             'b': ['y'],
+                             'c': [43]},
+                            index=[1])
+    tm.assert_frame_equal(result, expected)
+
+    # A mix of list and scalar entries in the same column
+    frame = pd.DataFrame([[[0, 1, 4], 'x', 42],
+                          [3, 'y', 43]],
+                         columns=columns)
+    result = frame.explode('a', dtype='int')
+    expected = pd.DataFrame({'a': [0, 1, 4, 3],
+                             'b': ['x', 'x', 'x', 'y'],
+                             'c': [42, 42, 42, 43]},
+                            index=[0, 0, 0, 1])
+    tm.assert_frame_equal(result, expected)
+
+    # Only scalars: the frame comes back unchanged
+    frame = pd.DataFrame([[0, 'x', 42],
+                          [3, 'y', 43]],
+                         columns=columns)
+    result = frame.explode('a')
+    expected = pd.DataFrame({'a': [0, 3],
+                             'b': ['x', 'y'],
+                             'c': [42, 43]},
+                            index=[0, 1])
+    tm.assert_frame_equal(result, expected)
+
+    # A frame without rows
+    result = pd.DataFrame(columns=['a', 'b']).explode('a')
+    expected = pd.DataFrame(columns=['a', 'b'])
+    tm.assert_frame_equal(result, expected)
+
+    # An unknown column name raises
+    with pytest.raises(KeyError):
+        frame.explode('badcolumnname')
+
+    # Row MultiIndex labels are repeated like ordinary index labels
+    row_index = pd.MultiIndex.from_tuples([(0, 'a'), (1, 'b')])
+    frame = pd.DataFrame([['foo,bar', 'x', 42],
+                          ['fizz,buzz', 'y', 43]],
+                         columns=columns,
+                         index=row_index)
+    result = frame.explode('a', sep=',')
+    expected_index = pd.MultiIndex.from_tuples(
+        [(0, 'a'), (0, 'a'), (1, 'b'), (1, 'b')])
+    expected = pd.DataFrame({'a': ['foo', 'bar', 'fizz', 'buzz'],
+                             'b': ['x', 'x', 'y', 'y'],
+                             'c': [42, 42, 43, 43]},
+                            index=expected_index)
+    tm.assert_frame_equal(result, expected)
+
+
 def test_unstack_fill_frame_object():
     # GH12815 Test unstacking with object.
     data = pd.Series(['a', 'b', 'c', 'a'], dtype='object')