From 05f8a6f6b192e9037747d2d0251b8506b53a70d1 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 10 Sep 2017 16:58:05 -0400 Subject: [PATCH] COMPAT: Iteration should always yield a python scalar xref #10904 closes #13236 closes #13256 xref #14216 --- doc/source/whatsnew/v0.21.0.txt | 47 ++++++++++++++++ pandas/core/base.py | 25 ++++++++- pandas/core/categorical.py | 6 ++ pandas/core/indexes/base.py | 9 --- pandas/core/indexes/category.py | 4 ++ pandas/core/series.py | 13 ----- pandas/core/sparse/array.py | 12 +++- pandas/tests/frame/test_api.py | 11 ++-- pandas/tests/frame/test_convert_to.py | 13 +++++ pandas/tests/series/test_io.py | 36 +----------- pandas/tests/test_base.py | 79 +++++++++++++++++++++++++-- 11 files changed, 187 insertions(+), 68 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index eccd71f45ec27..4cea36569e8e4 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -188,6 +188,53 @@ the target. Now, a ``ValueError`` will be raised when such an input is passed in ... ValueError: Cannot operate inplace if there is no assignment +.. _whatsnew_0210.api_breaking.iteration_scalars: + +Iteration of Series/Index will now return python scalars +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously, when using certain iteration methods for a ``Series`` with dtype ``int`` or ``float``, you would receive a ``numpy`` scalar, e.g. a ``np.int64``, rather than a python ``int``. Issue (:issue:`10904`) corrected this for ``Series.tolist()`` and ``list(Series)``. This change makes all iteration methods consistent, in particular, for ``__iter__()`` and ``.map()``; note that this only affects int/float dtypes. (:issue:`13236`, :issue:`13258`, :issue:`14216`). + +.. ipython:: python + + s = Series([1, 2, 3]) + s + +Previously: + +.. code-block:: python + + In [2]: type(list(s)[0]) + Out[2]: numpy.int64 + +New Behaviour: + +.. 
ipython:: python + + type(list(s)[0]) + +Furthermore this will now correctly box the results of iteration for :func:`DataFrame.to_dict` as well. + +.. ipython:: python + + d = {'a':[1], 'b':['b']} + df = DataFrame(d) + +Previously: + +.. code-block:: python + + In [8]: type(df.to_dict()['a'][0]) + Out[8]: numpy.int64 + +New Behaviour: + +.. ipython:: python + + type(df.to_dict()['a'][0]) + +.. _whatsnew_0210.api_breaking.dtype_conversions: + Dtype Conversions ^^^^^^^^^^^^^^^^^ diff --git a/pandas/core/base.py b/pandas/core/base.py index d60a8515dc920..62d89eac4b354 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -8,7 +8,12 @@ from pandas.core.dtypes.missing import isna from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries, ABCIndexClass -from pandas.core.dtypes.common import is_object_dtype, is_list_like, is_scalar +from pandas.core.dtypes.common import ( + is_object_dtype, + is_list_like, + is_scalar, + is_datetimelike) + from pandas.util._validators import validate_bool_kwarg from pandas.core import common as com @@ -18,7 +23,8 @@ from pandas.compat import PYPY from pandas.util._decorators import (Appender, cache_readonly, deprecate_kwarg, Substitution) -from pandas.core.common import AbstractMethodError +from pandas.core.common import AbstractMethodError, _maybe_box_datetimelike + from pandas.core.accessor import DirNamesMixin _shared_docs = dict() @@ -884,6 +890,21 @@ def argmin(self, axis=None): """ return nanops.nanargmin(self.values) + def tolist(self): + """ + return a list of the values; box to scalars + """ + return list(self.__iter__()) + + def __iter__(self): + """ + provide iteration over the values; box to scalars + """ + if is_datetimelike(self): + return (_maybe_box_datetimelike(x) for x in self._values) + else: + return iter(self._values.tolist()) + @cache_readonly def hasnans(self): """ return if I have any nans; enables various perf speedups """ diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 
1c2a29333001c..dbd2a79b7e46d 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -399,6 +399,12 @@ def itemsize(self): """ return the size of a single category """ return self.categories.itemsize + def tolist(self): + """ + return a list of my values + """ + return np.array(self).tolist() + def reshape(self, new_shape, *args, **kwargs): """ .. deprecated:: 0.19.0 diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index ef5f68936044a..008828cf4f309 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -585,12 +585,6 @@ def memory_usage(self, deep=False): return result # ops compat - def tolist(self): - """ - return a list of the Index values - """ - return list(self.values) - @deprecate_kwarg(old_arg_name='n', new_arg_name='repeats') def repeat(self, repeats, *args, **kwargs): """ @@ -1601,9 +1595,6 @@ def is_all_dates(self): return False return is_datetime_array(_ensure_object(self.values)) - def __iter__(self): - return iter(self.values) - def __reduce__(self): d = dict(data=self._data) d.update(self._get_attributes_dict()) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 0681202289311..c8044b14e4e57 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -253,6 +253,10 @@ def get_values(self): """ return the underlying data as an ndarray """ return self._data.get_values() + def __iter__(self): + """ iterate like Categorical """ + return self._data.__iter__() + @property def codes(self): return self._data.codes diff --git a/pandas/core/series.py b/pandas/core/series.py index 6905fc1aced74..ac11c5f908fdc 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -19,7 +19,6 @@ is_integer, is_integer_dtype, is_float_dtype, is_extension_type, is_datetimetz, - is_datetimelike, is_datetime64tz_dtype, is_timedelta64_dtype, is_list_like, @@ -1095,14 +1094,6 @@ def to_string(self, buf=None, na_rep='NaN', float_format=None, 
header=True, with open(buf, 'w') as f: f.write(result) - def __iter__(self): - """ provide iteration over the values of the Series - box values if necessary """ - if is_datetimelike(self): - return (_maybe_box_datetimelike(x) for x in self._values) - else: - return iter(self._values) - def iteritems(self): """ Lazily iterate over (index, value) tuples @@ -1118,10 +1109,6 @@ def keys(self): """Alias for index""" return self.index - def tolist(self): - """ Convert Series to a nested list """ - return list(self.asobject) - def to_dict(self, into=dict): """ Convert Series to {label -> value} dict or dict-like object. diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py index 2f830a98db649..f965c91999a03 100644 --- a/pandas/core/sparse/array.py +++ b/pandas/core/sparse/array.py @@ -407,8 +407,18 @@ def to_dense(self, fill=None): return self.values def __iter__(self): + if np.issubdtype(self.dtype, np.floating): + boxer = float + elif np.issubdtype(self.dtype, np.integer): + boxer = int + else: + boxer = lambda x: x + for i in range(len(self)): - yield self._get_val_at(i) + r = self._get_val_at(i) + + # box em + yield boxer(r) def __getitem__(self, key): """ diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index a62fcb506a34b..b3209da6449d6 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -9,7 +9,7 @@ import sys from distutils.version import LooseVersion -from pandas.compat import range, lrange +from pandas.compat import range, lrange, long from pandas import compat from numpy.random import randn @@ -205,15 +205,18 @@ def test_itertuples(self): 'ints': lrange(5)}, columns=['floats', 'ints']) for tup in df.itertuples(index=False): - assert isinstance(tup[1], np.integer) + assert isinstance(tup[1], (int, long)) df = self.klass(data={"a": [1, 2, 3], "b": [4, 5, 6]}) dfaa = df[['a', 'a']] assert (list(dfaa.itertuples()) == [(0, 1, 1), (1, 2, 2), (2, 3, 3)]) - assert 
(repr(list(df.itertuples(name=None))) == - '[(0, 1, 4), (1, 2, 5), (2, 3, 6)]') + + # repr will be int/long on windows + if not compat.is_platform_windows(): + assert (repr(list(df.itertuples(name=None))) == + '[(0, 1, 4), (1, 2, 5), (2, 3, 6)]') tup = next(df.itertuples(name='TestName')) diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py index 629c695b702fe..99e5630ce6a43 100644 --- a/pandas/tests/frame/test_convert_to.py +++ b/pandas/tests/frame/test_convert_to.py @@ -5,6 +5,7 @@ import numpy as np from pandas import compat +from pandas.compat import long from pandas import (DataFrame, Series, MultiIndex, Timestamp, date_range) @@ -236,3 +237,15 @@ def test_to_records_datetimeindex_with_tz(self, tz): # both converted to UTC, so they are equal tm.assert_numpy_array_equal(result, expected) + + def test_to_dict_box_scalars(self): + # 14216 + # make sure that we are boxing properly + d = {'a': [1], 'b': ['b']} + + result = DataFrame(d).to_dict() + assert isinstance(list(result['a'])[0], (int, long)) + assert isinstance(list(result['b'])[0], (int, long)) + + result = DataFrame(d).to_dict(orient='records') + assert isinstance(result[0]['a'], (int, long)) diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index 503185de427f1..5b7fd1ec94a90 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -10,7 +10,7 @@ from pandas import Series, DataFrame -from pandas.compat import StringIO, u, long +from pandas.compat import StringIO, u from pandas.util.testing import (assert_series_equal, assert_almost_equal, assert_frame_equal, ensure_clean) import pandas.util.testing as tm @@ -178,37 +178,3 @@ def test_to_dict(self, mapping): from_method = Series(ts.to_dict(collections.Counter)) from_constructor = Series(collections.Counter(ts.iteritems())) tm.assert_series_equal(from_method, from_constructor) - - -class TestSeriesToList(TestData): - - def test_tolist(self): - rs = self.ts.tolist() - 
xp = self.ts.values.tolist() - assert_almost_equal(rs, xp) - - # datetime64 - s = Series(self.ts.index) - rs = s.tolist() - assert self.ts.index[0] == rs[0] - - def test_tolist_np_int(self): - # GH10904 - for t in ['int8', 'int16', 'int32', 'int64']: - s = pd.Series([1], dtype=t) - assert isinstance(s.tolist()[0], (int, long)) - - def test_tolist_np_uint(self): - # GH10904 - for t in ['uint8', 'uint16']: - s = pd.Series([1], dtype=t) - assert isinstance(s.tolist()[0], int) - for t in ['uint32', 'uint64']: - s = pd.Series([1], dtype=t) - assert isinstance(s.tolist()[0], long) - - def test_tolist_np_float(self): - # GH10904 - for t in ['float16', 'float32', 'float64']: - s = pd.Series([1], dtype=t) - assert isinstance(s.tolist()[0], float) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 9e92c7cf1a9b8..210d0260b8d95 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -13,9 +13,10 @@ is_object_dtype, is_datetimetz, needs_i8_conversion) import pandas.util.testing as tm -from pandas import (Series, Index, DatetimeIndex, TimedeltaIndex, PeriodIndex, - Timedelta, IntervalIndex, Interval) -from pandas.compat import StringIO, PYPY +from pandas import (Series, Index, DatetimeIndex, TimedeltaIndex, + PeriodIndex, Timedelta, IntervalIndex, Interval, + CategoricalIndex, Timestamp) +from pandas.compat import StringIO, PYPY, long from pandas.compat.numpy import np_array_datetime64_compat from pandas.core.base import PandasDelegate, NoNewAttributesMixin from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin @@ -433,7 +434,7 @@ def test_value_counts_unique_nunique(self): # datetimetz Series returns array of Timestamp assert result[0] == orig[0] for r in result: - assert isinstance(r, pd.Timestamp) + assert isinstance(r, Timestamp) tm.assert_numpy_array_equal(result, orig._values.asobject.values) else: @@ -1031,3 +1032,73 @@ def f(): pytest.raises(AttributeError, f) assert not hasattr(t, "b") + + +class 
TestToIterable(object): + # test that we convert an iterable to python types + + dtypes = [ + ('int8', (int, long)), + ('int16', (int, long)), + ('int32', (int, long)), + ('int64', (int, long)), + ('uint8', (int, long)), + ('uint16', (int, long)), + ('uint32', (int, long)), + ('uint64', (int, long)), + ('float16', float), + ('float32', float), + ('float64', float), + ('datetime64[ns]', Timestamp), + ('datetime64[ns, US/Eastern]', Timestamp), + ('timedelta64[ns]', Timedelta)] + + @pytest.mark.parametrize( + 'dtype, rdtype', + dtypes + [ + ('object', object), + ('category', object)]) + @pytest.mark.parametrize( + 'method', + [ + lambda x: x.tolist(), + lambda x: list(x), + lambda x: list(x.__iter__()), + ], ids=['tolist', 'list', 'iter']) + @pytest.mark.parametrize('typ', [Series, Index]) + def test_iterable(self, typ, method, dtype, rdtype): + # gh-10904 + # gh-13258 + # coerce iteration to underlying python / pandas types + s = typ([1], dtype=dtype) + result = method(s)[0] + assert isinstance(result, rdtype) + + @pytest.mark.parametrize( + 'dtype, rdtype', + dtypes + [ + ('object', (int, long)), + ('category', (int, long))]) + @pytest.mark.parametrize('typ', [Series, Index]) + def test_iterable_map(self, typ, dtype, rdtype): + # gh-13236 + # coerce iteration to underlying python / pandas types + s = typ([1], dtype=dtype) + result = s.map(type)[0] + if not isinstance(rdtype, tuple): + rdtype = tuple([rdtype]) + assert result in rdtype + + @pytest.mark.parametrize( + 'method', + [ + lambda x: x.tolist(), + lambda x: list(x), + lambda x: list(x.__iter__()), + ], ids=['tolist', 'list', 'iter']) + def test_categorial_datetimelike(self, method): + i = CategoricalIndex([Timestamp('1999-12-31'), + Timestamp('2000-12-31')]) + + result = method(i)[0] + assert isinstance(result, Timestamp)