Skip to content

Commit

Permalink
COMPAT: Iteration should always yield a python scalar (pandas-dev#17491)
Browse files Browse the repository at this point in the history
  • Loading branch information
jreback authored Sep 12, 2017
1 parent e682902 commit 83436af
Show file tree
Hide file tree
Showing 11 changed files with 187 additions and 68 deletions.
47 changes: 47 additions & 0 deletions doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,53 @@ the target. Now, a ``ValueError`` will be raised when such an input is passed in
...
ValueError: Cannot operate inplace if there is no assignment

.. _whatsnew_0210.api_breaking.iteration_scalars:

Iteration of Series/Index will now return python scalars
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Previously, when using certain iteration methods for a ``Series`` with dtype ``int`` or ``float``, you would receive a ``numpy`` scalar, e.g. a ``np.int64``, rather than a python ``int``. Issue (:issue:`10904`) corrected this for ``Series.tolist()`` and ``list(Series)``. This change makes all iteration methods consistent, in particular ``__iter__()`` and ``.map()``; note that this only affects int/float dtypes. (:issue:`13236`, :issue:`13258`, :issue:`14216`).

.. ipython:: python

s = Series([1, 2, 3])
s

Previously:

.. code-block:: python

In [2]: type(list(s)[0])
Out[2]: numpy.int64

New Behaviour:

.. ipython:: python

type(list(s)[0])

Furthermore, this will now correctly box the results of iteration for :func:`DataFrame.to_dict` as well.

.. ipython:: python

d = {'a':[1], 'b':['b']}
df = DataFrame(d)

Previously:

.. code-block:: python

In [8]: type(df.to_dict()['a'][0])
Out[8]: numpy.int64

New Behaviour:

.. ipython:: python

type(df.to_dict()['a'][0])

.. _whatsnew_0210.api_breaking.dtype_conversions:

Dtype Conversions
^^^^^^^^^^^^^^^^^

Expand Down
25 changes: 23 additions & 2 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,12 @@

from pandas.core.dtypes.missing import isna
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries, ABCIndexClass
from pandas.core.dtypes.common import is_object_dtype, is_list_like, is_scalar
from pandas.core.dtypes.common import (
is_object_dtype,
is_list_like,
is_scalar,
is_datetimelike)

from pandas.util._validators import validate_bool_kwarg

from pandas.core import common as com
Expand All @@ -18,7 +23,8 @@
from pandas.compat import PYPY
from pandas.util._decorators import (Appender, cache_readonly,
deprecate_kwarg, Substitution)
from pandas.core.common import AbstractMethodError
from pandas.core.common import AbstractMethodError, _maybe_box_datetimelike

from pandas.core.accessor import DirNamesMixin

_shared_docs = dict()
Expand Down Expand Up @@ -884,6 +890,21 @@ def argmin(self, axis=None):
"""
return nanops.nanargmin(self.values)

def tolist(self):
    """
    Return the values as a python list.

    The elements are exactly what ``__iter__`` yields for this object, so
    they are boxed to scalars the same way plain iteration is.
    """
    boxed_values = self.__iter__()
    return list(boxed_values)

def __iter__(self):
    """
    Return an iterator over the values; box to scalars.

    Datetimelike data is boxed element by element with
    ``_maybe_box_datetimelike``; anything else is converted up front with
    ``self._values.tolist()`` and iterated from the resulting list.
    """
    if not is_datetimelike(self):
        return iter(self._values.tolist())

    # box each datetimelike element lazily, as it is consumed
    return (_maybe_box_datetimelike(item) for item in self._values)

@cache_readonly
def hasnans(self):
""" return if I have any nans; enables various perf speedups """
Expand Down
6 changes: 6 additions & 0 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,12 @@ def itemsize(self):
""" return the size of a single category """
return self.categories.itemsize

def tolist(self):
    """
    Return a list of my values, obtained by converting to an array with
    ``np.array(self)`` and calling that array's own ``tolist``.
    """
    as_array = np.array(self)
    return as_array.tolist()

def reshape(self, new_shape, *args, **kwargs):
"""
.. deprecated:: 0.19.0
Expand Down
9 changes: 0 additions & 9 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -585,12 +585,6 @@ def memory_usage(self, deep=False):
return result

# ops compat
def tolist(self):
    """
    Return the Index values as a list.

    The list is built directly from ``self.values``; the elements are not
    converted on the way.
    """
    values = self.values
    return list(values)

@deprecate_kwarg(old_arg_name='n', new_arg_name='repeats')
def repeat(self, repeats, *args, **kwargs):
"""
Expand Down Expand Up @@ -1601,9 +1595,6 @@ def is_all_dates(self):
return False
return is_datetime_array(_ensure_object(self.values))

def __iter__(self):
    """ iterate directly over the underlying ``values`` """
    underlying = self.values
    return iter(underlying)

def __reduce__(self):
d = dict(data=self._data)
d.update(self._get_attributes_dict())
Expand Down
4 changes: 4 additions & 0 deletions pandas/core/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,10 @@ def get_values(self):
""" return the underlying data as an ndarray """
return self._data.get_values()

def __iter__(self):
    """
    iterate like Categorical: hand off to the ``__iter__`` of the
    underlying ``_data``
    """
    categorical = self._data
    return categorical.__iter__()

@property
def codes(self):
    """ the ``codes`` of the underlying ``_data`` """
    data = self._data
    return data.codes
Expand Down
13 changes: 0 additions & 13 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
is_integer, is_integer_dtype,
is_float_dtype,
is_extension_type, is_datetimetz,
is_datetimelike,
is_datetime64tz_dtype,
is_timedelta64_dtype,
is_list_like,
Expand Down Expand Up @@ -1095,14 +1094,6 @@ def to_string(self, buf=None, na_rep='NaN', float_format=None, header=True,
with open(buf, 'w') as f:
f.write(result)

def __iter__(self):
    """
    Provide iteration over the values of the Series.

    Datetimelike values are boxed with ``_maybe_box_datetimelike``; all
    other values are iterated as stored in ``self._values``.
    """
    if not is_datetimelike(self):
        return iter(self._values)

    return (_maybe_box_datetimelike(item) for item in self._values)

def iteritems(self):
"""
Lazily iterate over (index, value) tuples
Expand All @@ -1118,10 +1109,6 @@ def keys(self):
"""Alias for index"""
return self.index

def tolist(self):
    """ Convert Series to a list, built from ``self.asobject`` """
    as_objects = self.asobject
    return list(as_objects)

def to_dict(self, into=dict):
"""
Convert Series to {label -> value} dict or dict-like object.
Expand Down
12 changes: 11 additions & 1 deletion pandas/core/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,8 +407,18 @@ def to_dense(self, fill=None):
return self.values

def __iter__(self):
    """
    Iterate over the array, yielding exactly one scalar per position.

    Values of a floating dtype are yielded as python ``float`` and values
    of an integer dtype as python ``int``; values of any other dtype are
    yielded exactly as returned by ``_get_val_at``.
    """
    if np.issubdtype(self.dtype, np.floating):
        boxer = float
    elif np.issubdtype(self.dtype, np.integer):
        boxer = int
    else:
        # nothing to box for other dtypes
        boxer = None

    for i in range(len(self)):
        r = self._get_val_at(i)

        # box em; yield each position once only (a second, unboxed yield
        # of the same position would duplicate every element)
        yield r if boxer is None else boxer(r)

def __getitem__(self, key):
"""
Expand Down
11 changes: 7 additions & 4 deletions pandas/tests/frame/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import sys
from distutils.version import LooseVersion

from pandas.compat import range, lrange
from pandas.compat import range, lrange, long
from pandas import compat

from numpy.random import randn
Expand Down Expand Up @@ -205,15 +205,18 @@ def test_itertuples(self):
'ints': lrange(5)}, columns=['floats', 'ints'])

for tup in df.itertuples(index=False):
assert isinstance(tup[1], np.integer)
assert isinstance(tup[1], (int, long))

df = self.klass(data={"a": [1, 2, 3], "b": [4, 5, 6]})
dfaa = df[['a', 'a']]

assert (list(dfaa.itertuples()) ==
[(0, 1, 1), (1, 2, 2), (2, 3, 3)])
assert (repr(list(df.itertuples(name=None))) ==
'[(0, 1, 4), (1, 2, 5), (2, 3, 6)]')

# repr will be int/long on windows
if not compat.is_platform_windows():
assert (repr(list(df.itertuples(name=None))) ==
'[(0, 1, 4), (1, 2, 5), (2, 3, 6)]')

tup = next(df.itertuples(name='TestName'))

Expand Down
13 changes: 13 additions & 0 deletions pandas/tests/frame/test_convert_to.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import numpy as np

from pandas import compat
from pandas.compat import long
from pandas import (DataFrame, Series, MultiIndex, Timestamp,
date_range)

Expand Down Expand Up @@ -236,3 +237,15 @@ def test_to_records_datetimeindex_with_tz(self, tz):

# both converted to UTC, so they are equal
tm.assert_numpy_array_equal(result, expected)

def test_to_dict_box_scalars(self):
    # 14216
    # make sure that we are boxing properly
    d = {'a': [1], 'b': ['b']}

    # The default orient gives {column -> {index label -> value}}, so look
    # the value up by its index label. ``list(result['a'])`` would only
    # list the keys (the index labels) and never check the boxed value.
    result = DataFrame(d).to_dict()
    assert isinstance(result['a'][0], (int, long))
    assert result['b'][0] == 'b'

    # orient='records' gives one {column -> value} dict per row
    result = DataFrame(d).to_dict(orient='records')
    assert isinstance(result[0]['a'], (int, long))
    assert result[0]['b'] == 'b'
36 changes: 1 addition & 35 deletions pandas/tests/series/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from pandas import Series, DataFrame

from pandas.compat import StringIO, u, long
from pandas.compat import StringIO, u
from pandas.util.testing import (assert_series_equal, assert_almost_equal,
assert_frame_equal, ensure_clean)
import pandas.util.testing as tm
Expand Down Expand Up @@ -178,37 +178,3 @@ def test_to_dict(self, mapping):
from_method = Series(ts.to_dict(collections.Counter))
from_constructor = Series(collections.Counter(ts.iteritems()))
tm.assert_series_equal(from_method, from_constructor)


class TestSeriesToList(TestData):
    """ checks on the element types and values returned by Series.tolist """

    def test_tolist(self):
        result = self.ts.tolist()
        expected = self.ts.values.tolist()
        assert_almost_equal(result, expected)

        # datetime64
        dt_series = Series(self.ts.index)
        result = dt_series.tolist()
        assert self.ts.index[0] == result[0]

    def test_tolist_np_int(self):
        # GH10904
        for dtype in ('int8', 'int16', 'int32', 'int64'):
            first = pd.Series([1], dtype=dtype).tolist()[0]
            assert isinstance(first, (int, long))

    def test_tolist_np_uint(self):
        # GH10904
        # the two narrow unsigned dtypes are checked against int, the two
        # wide ones against long
        expected_types = [('uint8', int), ('uint16', int),
                          ('uint32', long), ('uint64', long)]
        for dtype, expected in expected_types:
            first = pd.Series([1], dtype=dtype).tolist()[0]
            assert isinstance(first, expected)

    def test_tolist_np_float(self):
        # GH10904
        for dtype in ('float16', 'float32', 'float64'):
            first = pd.Series([1], dtype=dtype).tolist()[0]
            assert isinstance(first, float)
79 changes: 75 additions & 4 deletions pandas/tests/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,10 @@
is_object_dtype, is_datetimetz,
needs_i8_conversion)
import pandas.util.testing as tm
from pandas import (Series, Index, DatetimeIndex, TimedeltaIndex, PeriodIndex,
Timedelta, IntervalIndex, Interval)
from pandas.compat import StringIO, PYPY
from pandas import (Series, Index, DatetimeIndex, TimedeltaIndex,
PeriodIndex, Timedelta, IntervalIndex, Interval,
CategoricalIndex, Timestamp)
from pandas.compat import StringIO, PYPY, long
from pandas.compat.numpy import np_array_datetime64_compat
from pandas.core.base import PandasDelegate, NoNewAttributesMixin
from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin
Expand Down Expand Up @@ -433,7 +434,7 @@ def test_value_counts_unique_nunique(self):
# datetimetz Series returns array of Timestamp
assert result[0] == orig[0]
for r in result:
assert isinstance(r, pd.Timestamp)
assert isinstance(r, Timestamp)
tm.assert_numpy_array_equal(result,
orig._values.asobject.values)
else:
Expand Down Expand Up @@ -1031,3 +1032,73 @@ def f():

pytest.raises(AttributeError, f)
assert not hasattr(t, "b")


class TestToIterable(object):
    # test that we convert an iterable to python types

    # (dtype, expected scalar type(s)) pairs shared by the tests below
    dtypes = [
        ('int8', (int, long)),
        ('int16', (int, long)),
        ('int32', (int, long)),
        ('int64', (int, long)),
        ('uint8', (int, long)),
        ('uint16', (int, long)),
        ('uint32', (int, long)),
        ('uint64', (int, long)),
        ('float16', float),
        ('float32', float),
        ('float64', float),
        ('datetime64[ns]', Timestamp),
        ('datetime64[ns, US/Eastern]', Timestamp),
        ('timedelta64[ns]', Timedelta)]

    # the three ways of materialising the elements that are exercised
    _methods = [
        lambda x: x.tolist(),
        lambda x: list(x),
        lambda x: list(x.__iter__()),
    ]
    _method_ids = ['tolist', 'list', 'iter']

    @pytest.mark.parametrize(
        'dtype, rdtype',
        dtypes + [
            ('object', object),
            ('category', object)])
    @pytest.mark.parametrize('method', _methods, ids=_method_ids)
    @pytest.mark.parametrize('typ', [Series, Index])
    def test_iterable(self, typ, method, dtype, rdtype):
        # gh-10904
        # gh-13258
        # coerce iteration to underlying python / pandas types
        container = typ([1], dtype=dtype)
        first = method(container)[0]
        assert isinstance(first, rdtype)

    @pytest.mark.parametrize(
        'dtype, rdtype',
        dtypes + [
            ('object', (int, long)),
            ('category', (int, long))])
    @pytest.mark.parametrize('typ', [Series, Index])
    def test_iterable_map(self, typ, dtype, rdtype):
        # gh-13236
        # coerce iteration to underlying python / pandas types
        mapped_type = typ([1], dtype=dtype).map(type)[0]

        # normalise a single expected type to a 1-tuple for the ``in`` check
        accepted = rdtype if isinstance(rdtype, tuple) else (rdtype,)
        assert mapped_type in accepted

    @pytest.mark.parametrize('method', _methods, ids=_method_ids)
    def test_categorial_datetimelike(self, method):
        index = CategoricalIndex([Timestamp('1999-12-31'),
                                  Timestamp('2000-12-31')])

        first = method(index)[0]
        assert isinstance(first, Timestamp)

0 comments on commit 83436af

Please sign in to comment.