COMPAT: Iteration should always yield a python scalar (pandas-dev#17491)

xref pandas-dev#10904 closes pandas-dev#13236 closes pandas-dev#13256 xref pandas-dev#14216
jreback · Sep 12, 2017 · 83436af · 83436af
1 parent e682902
commit 83436af
Show file tree

Hide file tree

Showing 11 changed files with 187 additions and 68 deletions.
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
@@ -188,6 +188,53 @@ the target. Now, a ``ValueError`` will be raised when such an input is passed in
    ...
    ValueError: Cannot operate inplace if there is no assignment
 
+.. _whatsnew_0210.api_breaking.iteration_scalars:
+
+Iteration of Series/Index will now return python scalars
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Previously, when using certain iteration methods for a ``Series`` with dtype ``int`` or ``float``, you would receive a ``numpy`` scalar, e.g. a ``np.int64``, rather than a python ``int``. Issue (:issue:`10904`) corrected this for ``Series.tolist()`` and ``list(Series)``. This change makes all iteration methods consistent, in particular, for ``__iter__()`` and ``.map()``; note that this only affect int/float dtypes. (:issue:`13236`, :issue:`13258`, :issue:`14216`).
+
+.. ipython:: python
+
+   s = Series([1, 2, 3])
+   s
+
+Previously:
+
+.. code-block:: python
+
+   In [2]: type(list(s)[0])
+   Out[2]: numpy.int64
+
+New Behaviour:
+
+.. ipython:: python
+
+   type(list(s)[0])
+
+Furthermore this will now correctly box the results of iteration for :func:`DataFrame.to_dict` as well.
+
+.. ipython:: python
+
+   d = {'a':[1], 'b':['b']}
+   df = DataFrame(d)
+
+Previously:
+
+.. code-block:: python
+
+   In [8]: type(df.to_dict()['a'][0])
+   Out[8]: numpy.int64
+
+New Behaviour:
+
+.. ipython:: python
+
+   type(df.to_dict()['a'][0])
+
+.. _whatsnew_0210.api_breaking.dtype_conversions:
+
 Dtype Conversions
 ^^^^^^^^^^^^^^^^^
 

diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -8,7 +8,12 @@
 
 from pandas.core.dtypes.missing import isna
 from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries, ABCIndexClass
-from pandas.core.dtypes.common import is_object_dtype, is_list_like, is_scalar
+from pandas.core.dtypes.common import (
+    is_object_dtype,
+    is_list_like,
+    is_scalar,
+    is_datetimelike)
+
 from pandas.util._validators import validate_bool_kwarg
 
 from pandas.core import common as com
@@ -18,7 +23,8 @@
 from pandas.compat import PYPY
 from pandas.util._decorators import (Appender, cache_readonly,
                                      deprecate_kwarg, Substitution)
-from pandas.core.common import AbstractMethodError
+from pandas.core.common import AbstractMethodError, _maybe_box_datetimelike
+
 from pandas.core.accessor import DirNamesMixin
 
 _shared_docs = dict()
@@ -884,6 +890,21 @@ def argmin(self, axis=None):
         """
         return nanops.nanargmin(self.values)
 
+    def tolist(self):
+        """
+        return a list of the values; box to scalars
+        """
+        return list(self.__iter__())
+
+    def __iter__(self):
+        """
+        provide iteration over the values; box to scalars
+        """
+        if is_datetimelike(self):
+            return (_maybe_box_datetimelike(x) for x in self._values)
+        else:
+            return iter(self._values.tolist())
+
     @cache_readonly
     def hasnans(self):
         """ return if I have any nans; enables various perf speedups """

diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -399,6 +399,12 @@ def itemsize(self):
         """ return the size of a single category """
         return self.categories.itemsize
 
+    def tolist(self):
+        """
+        return a list of my values
+        """
+        return np.array(self).tolist()
+
     def reshape(self, new_shape, *args, **kwargs):
         """
         .. deprecated:: 0.19.0

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -585,12 +585,6 @@ def memory_usage(self, deep=False):
         return result
 
     # ops compat
-    def tolist(self):
-        """
-        return a list of the Index values
-        """
-        return list(self.values)
-
     @deprecate_kwarg(old_arg_name='n', new_arg_name='repeats')
     def repeat(self, repeats, *args, **kwargs):
         """
@@ -1601,9 +1595,6 @@ def is_all_dates(self):
             return False
         return is_datetime_array(_ensure_object(self.values))
 
-    def __iter__(self):
-        return iter(self.values)
-
     def __reduce__(self):
         d = dict(data=self._data)
         d.update(self._get_attributes_dict())

diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
@@ -253,6 +253,10 @@ def get_values(self):
         """ return the underlying data as an ndarray """
         return self._data.get_values()
 
+    def __iter__(self):
+        """ iterate like Categorical """
+        return self._data.__iter__()
+
     @property
     def codes(self):
         return self._data.codes

diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -19,7 +19,6 @@
     is_integer, is_integer_dtype,
     is_float_dtype,
     is_extension_type, is_datetimetz,
-    is_datetimelike,
     is_datetime64tz_dtype,
     is_timedelta64_dtype,
     is_list_like,
@@ -1095,14 +1094,6 @@ def to_string(self, buf=None, na_rep='NaN', float_format=None, header=True,
                 with open(buf, 'w') as f:
                     f.write(result)
 
-    def __iter__(self):
-        """ provide iteration over the values of the Series
-        box values if necessary """
-        if is_datetimelike(self):
-            return (_maybe_box_datetimelike(x) for x in self._values)
-        else:
-            return iter(self._values)
-
     def iteritems(self):
         """
         Lazily iterate over (index, value) tuples
@@ -1118,10 +1109,6 @@ def keys(self):
         """Alias for index"""
         return self.index
 
-    def tolist(self):
-        """ Convert Series to a nested list """
-        return list(self.asobject)
-
     def to_dict(self, into=dict):
         """
         Convert Series to {label -> value} dict or dict-like object.

diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py
@@ -407,8 +407,18 @@ def to_dense(self, fill=None):
         return self.values
 
     def __iter__(self):
+        if np.issubdtype(self.dtype, np.floating):
+            boxer = float
+        elif np.issubdtype(self.dtype, np.integer):
+            boxer = int
+        else:
+            boxer = lambda x: x
+
         for i in range(len(self)):
-            yield self._get_val_at(i)
+            r = self._get_val_at(i)
+
+            # box em
+            yield boxer(r)
 
     def __getitem__(self, key):
         """

diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py
@@ -9,7 +9,7 @@
 import sys
 from distutils.version import LooseVersion
 
-from pandas.compat import range, lrange
+from pandas.compat import range, lrange, long
 from pandas import compat
 
 from numpy.random import randn
@@ -205,15 +205,18 @@ def test_itertuples(self):
                          'ints': lrange(5)}, columns=['floats', 'ints'])
 
         for tup in df.itertuples(index=False):
-            assert isinstance(tup[1], np.integer)
+            assert isinstance(tup[1], (int, long))
 
         df = self.klass(data={"a": [1, 2, 3], "b": [4, 5, 6]})
         dfaa = df[['a', 'a']]
 
         assert (list(dfaa.itertuples()) ==
                 [(0, 1, 1), (1, 2, 2), (2, 3, 3)])
-        assert (repr(list(df.itertuples(name=None))) ==
-                '[(0, 1, 4), (1, 2, 5), (2, 3, 6)]')
+
+        # repr with be int/long on windows
+        if not compat.is_platform_windows():
+            assert (repr(list(df.itertuples(name=None))) ==
+                    '[(0, 1, 4), (1, 2, 5), (2, 3, 6)]')
 
         tup = next(df.itertuples(name='TestName'))
 

diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py
@@ -5,6 +5,7 @@
 import numpy as np
 
 from pandas import compat
+from pandas.compat import long
 from pandas import (DataFrame, Series, MultiIndex, Timestamp,
                     date_range)
 
@@ -236,3 +237,15 @@ def test_to_records_datetimeindex_with_tz(self, tz):
 
         # both converted to UTC, so they are equal
         tm.assert_numpy_array_equal(result, expected)
+
+    def test_to_dict_box_scalars(self):
+        # 14216
+        # make sure that we are boxing properly
+        d = {'a': [1], 'b': ['b']}
+
+        result = DataFrame(d).to_dict()
+        assert isinstance(list(result['a'])[0], (int, long))
+        assert isinstance(list(result['b'])[0], (int, long))
+
+        result = DataFrame(d).to_dict(orient='records')
+        assert isinstance(result[0]['a'], (int, long))
diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py
@@ -10,7 +10,7 @@
 
 from pandas import Series, DataFrame
 
-from pandas.compat import StringIO, u, long
+from pandas.compat import StringIO, u
 from pandas.util.testing import (assert_series_equal, assert_almost_equal,
                                  assert_frame_equal, ensure_clean)
 import pandas.util.testing as tm
@@ -178,37 +178,3 @@ def test_to_dict(self, mapping):
         from_method = Series(ts.to_dict(collections.Counter))
         from_constructor = Series(collections.Counter(ts.iteritems()))
         tm.assert_series_equal(from_method, from_constructor)
-
-
-class TestSeriesToList(TestData):
-
-    def test_tolist(self):
-        rs = self.ts.tolist()
-        xp = self.ts.values.tolist()
-        assert_almost_equal(rs, xp)
-
-        # datetime64
-        s = Series(self.ts.index)
-        rs = s.tolist()
-        assert self.ts.index[0] == rs[0]
-
-    def test_tolist_np_int(self):
-        # GH10904
-        for t in ['int8', 'int16', 'int32', 'int64']:
-            s = pd.Series([1], dtype=t)
-            assert isinstance(s.tolist()[0], (int, long))
-
-    def test_tolist_np_uint(self):
-        # GH10904
-        for t in ['uint8', 'uint16']:
-            s = pd.Series([1], dtype=t)
-            assert isinstance(s.tolist()[0], int)
-        for t in ['uint32', 'uint64']:
-            s = pd.Series([1], dtype=t)
-            assert isinstance(s.tolist()[0], long)
-
-    def test_tolist_np_float(self):
-        # GH10904
-        for t in ['float16', 'float32', 'float64']:
-            s = pd.Series([1], dtype=t)
-            assert isinstance(s.tolist()[0], float)
diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py
@@ -13,9 +13,10 @@
     is_object_dtype, is_datetimetz,
     needs_i8_conversion)
 import pandas.util.testing as tm
-from pandas import (Series, Index, DatetimeIndex, TimedeltaIndex, PeriodIndex,
-                    Timedelta, IntervalIndex, Interval)
-from pandas.compat import StringIO, PYPY
+from pandas import (Series, Index, DatetimeIndex, TimedeltaIndex,
+                    PeriodIndex, Timedelta, IntervalIndex, Interval,
+                    CategoricalIndex, Timestamp)
+from pandas.compat import StringIO, PYPY, long
 from pandas.compat.numpy import np_array_datetime64_compat
 from pandas.core.base import PandasDelegate, NoNewAttributesMixin
 from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin
@@ -433,7 +434,7 @@ def test_value_counts_unique_nunique(self):
                 # datetimetz Series returns array of Timestamp
                 assert result[0] == orig[0]
                 for r in result:
-                    assert isinstance(r, pd.Timestamp)
+                    assert isinstance(r, Timestamp)
                 tm.assert_numpy_array_equal(result,
                                             orig._values.asobject.values)
             else:
@@ -1031,3 +1032,73 @@ def f():
 
         pytest.raises(AttributeError, f)
         assert not hasattr(t, "b")
+
+
+class TestToIterable(object):
+    # test that we convert an iterable to python types
+
+    dtypes = [
+        ('int8', (int, long)),
+        ('int16', (int, long)),
+        ('int32', (int, long)),
+        ('int64', (int, long)),
+        ('uint8', (int, long)),
+        ('uint16', (int, long)),
+        ('uint32', (int, long)),
+        ('uint64', (int, long)),
+        ('float16', float),
+        ('float32', float),
+        ('float64', float),
+        ('datetime64[ns]', Timestamp),
+        ('datetime64[ns, US/Eastern]', Timestamp),
+        ('timedelta64[ns]', Timedelta)]
+
+    @pytest.mark.parametrize(
+        'dtype, rdtype',
+        dtypes + [
+            ('object', object),
+            ('category', object)])
+    @pytest.mark.parametrize(
+        'method',
+        [
+            lambda x: x.tolist(),
+            lambda x: list(x),
+            lambda x: list(x.__iter__()),
+        ], ids=['tolist', 'list', 'iter'])
+    @pytest.mark.parametrize('typ', [Series, Index])
+    def test_iterable(self, typ, method, dtype, rdtype):
+        # gh-10904
+        # gh-13258
+        # coerce iteration to underlying python / pandas types
+        s = typ([1], dtype=dtype)
+        result = method(s)[0]
+        assert isinstance(result, rdtype)
+
+    @pytest.mark.parametrize(
+        'dtype, rdtype',
+        dtypes + [
+            ('object', (int, long)),
+            ('category', (int, long))])
+    @pytest.mark.parametrize('typ', [Series, Index])
+    def test_iterable_map(self, typ, dtype, rdtype):
+        # gh-13236
+        # coerce iteration to underlying python / pandas types
+        s = typ([1], dtype=dtype)
+        result = s.map(type)[0]
+        if not isinstance(rdtype, tuple):
+            rdtype = tuple([rdtype])
+        assert result in rdtype
+
+    @pytest.mark.parametrize(
+        'method',
+        [
+            lambda x: x.tolist(),
+            lambda x: list(x),
+            lambda x: list(x.__iter__()),
+        ], ids=['tolist', 'list', 'iter'])
+    def test_categorial_datetimelike(self, method):
+        i = CategoricalIndex([Timestamp('1999-12-31'),
+                              Timestamp('2000-12-31')])
+
+        result = method(i)[0]
+        assert isinstance(result, Timestamp)