Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor index-as-string groupby tests and fix spurious warning (Bug 17383) #17843

Merged
merged 5 commits into from
Oct 14, 2017
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -974,6 +974,7 @@ Groupby/Resample/Rolling
- Bug in ``DataFrame.groupby`` where index and column keys were not recognized correctly when the number of keys equaled the number of elements on the groupby axis (:issue:`16859`)
- Bug in ``groupby.nunique()`` with ``TimeGrouper`` which cannot handle ``NaT`` correctly (:issue:`17575`)
- Bug in ``DataFrame.groupby`` where a single level selection from a ``MultiIndex`` unexpectedly sorts (:issue:`17537`)
- Bug in ``DataFrame.groupby`` where spurious warning is raised when ``Grouper`` object is used to override ambiguous column name (:issue:`17383`)
- Bug in ``TimeGrouper`` differs when passes as a list and as a scalar (:issue:`17530`)

Sparse
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2704,7 +2704,7 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True,

# a passed-in Grouper, directly convert
if isinstance(key, Grouper):
binner, grouper, obj = key._get_grouper(obj)
binner, grouper, obj = key._get_grouper(obj, validate=False)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

note that I had to add this flag to 'fix' this warning issue elsewhere, I don't really like it, but would require more refactoring to make this cleaner.

if key.key is None:
return grouper, [], obj
else:
Expand Down
152 changes: 0 additions & 152 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,158 +253,6 @@ def test_grouper_column_and_index(self):
expected = df_single.reset_index().groupby(['inner', 'B']).mean()
assert_frame_equal(result, expected)

def test_grouper_index_level_as_string(self):
# GH 5677, allow strings passed as the `by` parameter to reference
# columns or index levels

idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 3),
('b', 1), ('b', 2), ('b', 3)])
idx.names = ['outer', 'inner']
df_multi = pd.DataFrame({"A": np.arange(6),
'B': ['one', 'one', 'two',
'two', 'one', 'one']},
index=idx)

df_single = df_multi.reset_index('outer')

# Column and Index on MultiIndex
result = df_multi.groupby(['B', 'inner']).mean()
expected = df_multi.groupby(['B', pd.Grouper(level='inner')]).mean()
assert_frame_equal(result, expected)

# Index and Column on MultiIndex
result = df_multi.groupby(['inner', 'B']).mean()
expected = df_multi.groupby([pd.Grouper(level='inner'), 'B']).mean()
assert_frame_equal(result, expected)

# Column and Index on single Index
result = df_single.groupby(['B', 'inner']).mean()
expected = df_single.groupby(['B', pd.Grouper(level='inner')]).mean()
assert_frame_equal(result, expected)

# Index and Column on single Index
result = df_single.groupby(['inner', 'B']).mean()
expected = df_single.groupby([pd.Grouper(level='inner'), 'B']).mean()
assert_frame_equal(result, expected)

# Single element list of Index on MultiIndex
result = df_multi.groupby(['inner']).mean()
expected = df_multi.groupby(pd.Grouper(level='inner')).mean()
assert_frame_equal(result, expected)

# Single element list of Index on single Index
result = df_single.groupby(['inner']).mean()
expected = df_single.groupby(pd.Grouper(level='inner')).mean()
assert_frame_equal(result, expected)

# Index on MultiIndex
result = df_multi.groupby('inner').mean()
expected = df_multi.groupby(pd.Grouper(level='inner')).mean()
assert_frame_equal(result, expected)

# Index on single Index
result = df_single.groupby('inner').mean()
expected = df_single.groupby(pd.Grouper(level='inner')).mean()
assert_frame_equal(result, expected)

def test_grouper_column_index_level_precedence(self):
# GH 5677, when a string passed as the `by` parameter
# matches a column and an index level the column takes
# precedence

idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 3),
('b', 1), ('b', 2), ('b', 3)])
idx.names = ['outer', 'inner']
df_multi_both = pd.DataFrame({"A": np.arange(6),
'B': ['one', 'one', 'two',
'two', 'one', 'one'],
'inner': [1, 1, 1, 1, 1, 1]},
index=idx)

df_single_both = df_multi_both.reset_index('outer')

# Group MultiIndex by single key
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = df_multi_both.groupby('inner').mean()

expected = df_multi_both.groupby([pd.Grouper(key='inner')]).mean()
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The pd.Grouper object in this expected expression shouldn't have been wrapped in a list. If it had not been, the spurious warning would have been raised in this test. This is corrected in the new test below.

assert_frame_equal(result, expected)
not_expected = df_multi_both.groupby(pd.Grouper(level='inner')).mean()
assert not result.index.equals(not_expected.index)

# Group single Index by single key
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = df_single_both.groupby('inner').mean()

expected = df_single_both.groupby([pd.Grouper(key='inner')]).mean()
assert_frame_equal(result, expected)
not_expected = df_single_both.groupby(pd.Grouper(level='inner')).mean()
assert not result.index.equals(not_expected.index)

# Group MultiIndex by single key list
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = df_multi_both.groupby(['inner']).mean()

expected = df_multi_both.groupby([pd.Grouper(key='inner')]).mean()
assert_frame_equal(result, expected)
not_expected = df_multi_both.groupby(pd.Grouper(level='inner')).mean()
assert not result.index.equals(not_expected.index)

# Group single Index by single key list
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = df_single_both.groupby(['inner']).mean()

expected = df_single_both.groupby([pd.Grouper(key='inner')]).mean()
assert_frame_equal(result, expected)
not_expected = df_single_both.groupby(pd.Grouper(level='inner')).mean()
assert not result.index.equals(not_expected.index)

# Group MultiIndex by two keys (1)
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = df_multi_both.groupby(['B', 'inner']).mean()

expected = df_multi_both.groupby(['B',
pd.Grouper(key='inner')]).mean()
assert_frame_equal(result, expected)
not_expected = df_multi_both.groupby(['B',
pd.Grouper(level='inner')
]).mean()
assert not result.index.equals(not_expected.index)

# Group MultiIndex by two keys (2)
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = df_multi_both.groupby(['inner', 'B']).mean()

expected = df_multi_both.groupby([pd.Grouper(key='inner'),
'B']).mean()
assert_frame_equal(result, expected)
not_expected = df_multi_both.groupby([pd.Grouper(level='inner'),
'B']).mean()
assert not result.index.equals(not_expected.index)

# Group single Index by two keys (1)
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = df_single_both.groupby(['B', 'inner']).mean()

expected = df_single_both.groupby(['B',
pd.Grouper(key='inner')]).mean()
assert_frame_equal(result, expected)
not_expected = df_single_both.groupby(['B',
pd.Grouper(level='inner')
]).mean()
assert not result.index.equals(not_expected.index)

# Group single Index by two keys (2)
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = df_single_both.groupby(['inner', 'B']).mean()

expected = df_single_both.groupby([pd.Grouper(key='inner'),
'B']).mean()
assert_frame_equal(result, expected)
not_expected = df_single_both.groupby([pd.Grouper(level='inner'),
'B']).mean()
assert not result.index.equals(not_expected.index)

def test_grouper_getting_correct_binner(self):

# GH 10063
Expand Down
142 changes: 142 additions & 0 deletions pandas/tests/groupby/test_index_as_string.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
import pytest
import pandas as pd
import numpy as np

from pandas.util.testing import assert_frame_equal, assert_series_equal
import pandas.util.testing as tm


def build_df_multi():
idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 3),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

these should just be fixtures

('b', 1), ('b', 2), ('b', 3)])
idx.names = ['outer', 'inner']
df_multi = pd.DataFrame({"A": np.arange(6),
'B': ['one', 'one', 'two',
'two', 'one', 'one']},
index=idx)
return df_multi


def build_df_single():
df_single = build_df_multi().reset_index('outer')
return df_single


def build_test_series():
series_multi = build_df_multi().set_index('B', append=True)['A']
return series_multi


class TestGroupByIndexAsString(object):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no real need to make this a class, that is really a leftover from nose, just make these functions (of course a class is good for grouping generally)


@pytest.mark.parametrize('frame', [build_df_multi(), build_df_single()])
def test_grouper_index_level_as_string(self, frame):
# Column and Index
result = frame.groupby(['B', 'inner']).mean()
expected = frame.groupby(['B', pd.Grouper(level='inner')]).mean()
assert_frame_equal(result, expected)

# Index and Column
result = frame.groupby(['inner', 'B']).mean()
expected = frame.groupby([pd.Grouper(level='inner'), 'B']).mean()
assert_frame_equal(result, expected)

# Single element list of Index
result = frame.groupby(['inner']).mean()
expected = frame.groupby(pd.Grouper(level='inner')).mean()
assert_frame_equal(result, expected)

# Index name
result = frame.groupby('inner').mean()
expected = frame.groupby(pd.Grouper(level='inner')).mean()
assert_frame_equal(result, expected)

@pytest.mark.parametrize('levels', [
'inner', 'outer', 'B',
['inner'], ['outer'], ['B'],
['inner', 'outer'], ['inner', 'outer', 'B']
])
def test_grouper_index_level_as_string_series(self, levels):
s = build_test_series()

# Compute expected result
if isinstance(levels, list):
groupers = [pd.Grouper(level=lv) for lv in levels]
else:
groupers = pd.Grouper(level=levels)

expected = s.groupby(groupers).mean()

# Compute and check result
result = s.groupby(levels).mean()
assert_series_equal(result, expected)

@pytest.mark.parametrize('frame', [build_df_multi(), build_df_single()])
def test_grouper_column_index_level_precedence(self, frame):
# GH 5677, when a string passed as the `by` parameter
# matches a column and an index level the column takes
# precedence

# Add 'inner' column to frame
# (frame already has an 'inner' index)
frame['inner'] = [1, 1, 1, 1, 1, 1]

# Group by single key
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = frame.groupby('inner').mean()

with tm.assert_produces_warning(False):
expected = frame.groupby(pd.Grouper(key='inner')).mean()
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that the pd.Grouper object is no longer wrapped in a list and that we now assert that no warning is raised. This is the test case that would have failed without the fix in this PR.


assert_frame_equal(result, expected)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

happy to have even another level of parameterization to make these shorter here (if that's possible)

with tm.assert_produces_warning(False):
not_expected = frame.groupby(pd.Grouper(level='inner')).mean()

assert not result.index.equals(not_expected.index)

# Group by single key list
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = frame.groupby(['inner']).mean()

with tm.assert_produces_warning(False):
expected = frame.groupby([pd.Grouper(key='inner')]).mean()

assert_frame_equal(result, expected)

with tm.assert_produces_warning(False):
not_expected = frame.groupby(pd.Grouper(level='inner')).mean()

assert not result.index.equals(not_expected.index)

# Group by two keys ('B', 'inner')
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = frame.groupby(['B', 'inner']).mean()

with tm.assert_produces_warning(False):
expected = frame.groupby(['B',
pd.Grouper(key='inner')]).mean()

assert_frame_equal(result, expected)

with tm.assert_produces_warning(False):
not_expected = frame.groupby(['B',
pd.Grouper(level='inner')
]).mean()

assert not result.index.equals(not_expected.index)

# Group by two keys ('inner', 'B')
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = frame.groupby(['inner', 'B']).mean()

with tm.assert_produces_warning(False):
expected = frame.groupby([pd.Grouper(key='inner'),
'B']).mean()

assert_frame_equal(result, expected)

with tm.assert_produces_warning(False):
not_expected = frame.groupby([pd.Grouper(level='inner'),
'B']).mean()
assert not result.index.equals(not_expected.index)