diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 9f58ee2f8b99b..251d94cbdd911 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -1178,8 +1178,7 @@ takes as an argument the columns to use to identify duplicated rows. - ``drop_duplicates`` removes duplicate rows. By default, the first observed row of a duplicate set is considered unique, but -each method has a ``take_last`` parameter that indicates the last observed row -should be taken instead. +each method has a ``keep`` parameter to specify targets to be kept. .. ipython:: python @@ -1187,8 +1186,11 @@ should be taken instead. 'b' : ['x', 'y', 'y', 'x', 'y', 'x', 'x'], 'c' : np.random.randn(7)}) df2.duplicated(['a','b']) + df2.duplicated(['a','b'], keep='last') + df2.duplicated(['a','b'], keep=False) df2.drop_duplicates(['a','b']) - df2.drop_duplicates(['a','b'], take_last=True) + df2.drop_duplicates(['a','b'], keep='last') + df2.drop_duplicates(['a','b'], keep=False) An alternative way to drop duplicates on the index is ``.groupby(level=0)`` combined with ``first()`` or ``last()``. @@ -1199,7 +1201,7 @@ An alternative way to drop duplicates on the index is ``.groupby(level=0)`` comb df3.groupby(level=0).first() # a bit more verbose - df3.reset_index().drop_duplicates(subset='b', take_last=False).set_index('b') + df3.reset_index().drop_duplicates(subset='b', keep='first').set_index('b') .. _indexing.dictionarylike: diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 86bb78f4066ab..70d616ca72c1b 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -142,6 +142,15 @@ Other enhancements - ``pd.merge`` will now allow duplicate column names if they are not merged upon (:issue:`10639`). - ``pd.pivot`` will now allow passing index as ``None`` (:issue:`3962`). +- ``drop_duplicates`` and ``duplicated`` now accept ``keep`` keyword to target first, last, and all duplicates. 
``take_last`` keyword is deprecated, see :ref:`deprecations <whatsnew_0170.deprecations>` (:issue:`6511`, :issue:`8505`) + +.. ipython:: python + + s = pd.Series(['A', 'B', 'C', 'A', 'B', 'D']) + s.drop_duplicates() + s.drop_duplicates(keep='last') + s.drop_duplicates(keep=False) + .. _whatsnew_0170.api: @@ -520,6 +529,7 @@ Deprecations ===================== ================================= - ``Categorical.name`` was deprecated to make ``Categorical`` more ``numpy.ndarray`` like. Use ``Series(cat, name="whatever")`` instead (:issue:`10482`). +- ``drop_duplicates`` and ``duplicated``'s ``take_last`` keyword was deprecated in favor of ``keep``. (:issue:`6511`, :issue:`8505`) .. _whatsnew_0170.prior_deprecations: diff --git a/pandas/core/base.py b/pandas/core/base.py index c3004aec60cc5..6d1c89a7a2f89 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -6,7 +6,7 @@ from pandas.core import common as com import pandas.core.nanops as nanops import pandas.lib as lib -from pandas.util.decorators import Appender, cache_readonly +from pandas.util.decorators import Appender, cache_readonly, deprecate_kwarg from pandas.core.strings import StringMethods from pandas.core.common import AbstractMethodError @@ -543,8 +543,12 @@ def _dir_deletions(self): Parameters ---------- - take_last : boolean, default False - Take the last observed index in a group. Default first + + keep : {'first', 'last', False}, default 'first' + - ``first`` : Drop duplicates except for the first occurrence. + - ``last`` : Drop duplicates except for the last occurrence. + - False : Drop all duplicates. 
+ take_last : deprecated %(inplace)s Returns @@ -552,9 +556,10 @@ def _dir_deletions(self): deduplicated : %(klass)s """) + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) @Appender(_shared_docs['drop_duplicates'] % _indexops_doc_kwargs) - def drop_duplicates(self, take_last=False, inplace=False): - duplicated = self.duplicated(take_last=take_last) + def drop_duplicates(self, keep='first', inplace=False): + duplicated = self.duplicated(keep=keep) result = self[np.logical_not(duplicated)] if inplace: return self._update_inplace(result) @@ -566,18 +571,22 @@ def drop_duplicates(self, take_last=False, inplace=False): Parameters ---------- - take_last : boolean, default False - Take the last observed index in a group. Default first + keep : {'first', 'last', False}, default 'first' + - ``first`` : Mark duplicates as ``True`` except for the first occurrence. + - ``last`` : Mark duplicates as ``True`` except for the last occurrence. + - False : Mark all duplicates as ``True``. 
+ take_last : deprecated Returns ------- duplicated : %(duplicated)s """) + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) @Appender(_shared_docs['duplicated'] % _indexops_doc_kwargs) - def duplicated(self, take_last=False): + def duplicated(self, keep='first'): keys = com._ensure_object(self.values) - duplicated = lib.duplicated(keys, take_last=take_last) + duplicated = lib.duplicated(keys, keep=keep) try: return self._constructor(duplicated, index=self.index).__finalize__(self) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d8948bc82fe61..fe9c9bece1f79 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2866,8 +2866,9 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, else: return result + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) @deprecate_kwarg(old_arg_name='cols', new_arg_name='subset') - def drop_duplicates(self, subset=None, take_last=False, inplace=False): + def drop_duplicates(self, subset=None, keep='first', inplace=False): """ Return DataFrame with duplicate rows removed, optionally only considering certain columns @@ -2877,8 +2878,11 @@ def drop_duplicates(self, subset=None, take_last=False, inplace=False): subset : column label or sequence of labels, optional Only consider certain columns for identifying duplicates, by default use all of the columns - take_last : boolean, default False - Take the last observed row in a row. Defaults to the first row + keep : {'first', 'last', False}, default 'first' + - ``first`` : Drop duplicates except for the first occurrence. + - ``last`` : Drop duplicates except for the last occurrence. + - False : Drop all duplicates. 
+ take_last : deprecated inplace : boolean, default False Whether to drop duplicates in place or to return a copy cols : kwargs only argument of subset [deprecated] @@ -2887,7 +2891,7 @@ def drop_duplicates(self, subset=None, take_last=False, inplace=False): ------- deduplicated : DataFrame """ - duplicated = self.duplicated(subset, take_last=take_last) + duplicated = self.duplicated(subset, keep=keep) if inplace: inds, = (-duplicated).nonzero() @@ -2896,8 +2900,9 @@ def drop_duplicates(self, subset=None, take_last=False, inplace=False): else: return self[-duplicated] + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) @deprecate_kwarg(old_arg_name='cols', new_arg_name='subset') - def duplicated(self, subset=None, take_last=False): + def duplicated(self, subset=None, keep='first'): """ Return boolean Series denoting duplicate rows, optionally only considering certain columns @@ -2907,9 +2912,13 @@ def duplicated(self, subset=None, take_last=False): subset : column label or sequence of labels, optional Only consider certain columns for identifying duplicates, by default use all of the columns - take_last : boolean, default False - For a set of distinct duplicate rows, flag all but the last row as - duplicated. Default is for all but the first row to be flagged + keep : {'first', 'last', False}, default 'first' + - ``first`` : Mark duplicates as ``True`` except for the + first occurrence. + - ``last`` : Mark duplicates as ``True`` except for the + last occurrence. + - False : Mark all duplicates as ``True``. 
+ take_last : deprecated cols : kwargs only argument of subset [deprecated] Returns @@ -2935,7 +2944,7 @@ def f(vals): labels, shape = map(list, zip( * map(f, vals))) ids = get_group_index(labels, shape, sort=False, xnull=False) - return Series(duplicated_int64(ids, take_last), index=self.index) + return Series(duplicated_int64(ids, keep), index=self.index) #---------------------------------------------------------------------- # Sorting diff --git a/pandas/core/index.py b/pandas/core/index.py index a9631d7aabedd..febcfa37994a3 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -16,7 +16,7 @@ from pandas.lib import Timestamp, Timedelta, is_datetime_array from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, _shared_docs, PandasDelegate from pandas.util.decorators import (Appender, Substitution, cache_readonly, - deprecate) + deprecate, deprecate_kwarg) import pandas.core.common as com from pandas.core.common import (isnull, array_equivalent, is_dtype_equal, is_object_dtype, _values_from_object, is_float, is_integer, is_iterator, is_categorical_dtype, @@ -2628,13 +2628,15 @@ def drop(self, labels, errors='raise'): indexer = indexer[~mask] return self.delete(indexer) + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) @Appender(_shared_docs['drop_duplicates'] % _index_doc_kwargs) - def drop_duplicates(self, take_last=False): - return super(Index, self).drop_duplicates(take_last=take_last) + def drop_duplicates(self, keep='first'): + return super(Index, self).drop_duplicates(keep=keep) + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) @Appender(_shared_docs['duplicated'] % _index_doc_kwargs) - def duplicated(self, take_last=False): - return super(Index, self).duplicated(take_last=take_last) + def duplicated(self, keep='first'): + return super(Index, self).duplicated(keep=keep) def _evaluate_with_timedelta_like(self, other, op, opstr): raise TypeError("can only perform 
ops with timedelta like values") @@ -3065,10 +3067,11 @@ def _engine(self): def is_unique(self): return not self.duplicated().any() + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) @Appender(_shared_docs['duplicated'] % _index_doc_kwargs) - def duplicated(self, take_last=False): + def duplicated(self, keep='first'): from pandas.hashtable import duplicated_int64 - return duplicated_int64(self.codes.astype('i8'), take_last) + return duplicated_int64(self.codes.astype('i8'), keep) def get_loc(self, key, method=None): """ @@ -4228,15 +4231,16 @@ def _has_complex_internals(self): def is_unique(self): return not self.duplicated().any() + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) @Appender(_shared_docs['duplicated'] % _index_doc_kwargs) - def duplicated(self, take_last=False): + def duplicated(self, keep='first'): from pandas.core.groupby import get_group_index from pandas.hashtable import duplicated_int64 shape = map(len, self.levels) ids = get_group_index(self.labels, shape, sort=False, xnull=False) - return duplicated_int64(ids, take_last) + return duplicated_int64(ids, keep) def get_value(self, series, key): # somewhat broken encapsulation diff --git a/pandas/core/series.py b/pandas/core/series.py index 6586fa10935e6..87fde996aaa67 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -46,7 +46,7 @@ import pandas.core.datetools as datetools import pandas.core.format as fmt import pandas.core.nanops as nanops -from pandas.util.decorators import Appender, cache_readonly +from pandas.util.decorators import Appender, cache_readonly, deprecate_kwarg import pandas.lib as lib import pandas.tslib as tslib @@ -1155,14 +1155,15 @@ def mode(self): from pandas.core.algorithms import mode return mode(self) + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) @Appender(base._shared_docs['drop_duplicates'] % _shared_doc_kwargs) - def drop_duplicates(self, take_last=False, 
inplace=False): - return super(Series, self).drop_duplicates(take_last=take_last, - inplace=inplace) + def drop_duplicates(self, keep='first', inplace=False): + return super(Series, self).drop_duplicates(keep=keep, inplace=inplace) + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) @Appender(base._shared_docs['duplicated'] % _shared_doc_kwargs) - def duplicated(self, take_last=False): - return super(Series, self).duplicated(take_last=take_last) + def duplicated(self, keep='first'): + return super(Series, self).duplicated(keep=keep) def idxmin(self, axis=None, out=None, skipna=True): """ diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index 3b3ea9fa032f8..7dbd1b45c938f 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -1026,25 +1026,41 @@ def mode_int64(int64_t[:] values): @cython.wraparound(False) @cython.boundscheck(False) -def duplicated_int64(ndarray[int64_t, ndim=1] values, int take_last): +def duplicated_int64(ndarray[int64_t, ndim=1] values, object keep='first'): cdef: - int ret = 0 + int ret = 0, value, k Py_ssize_t i, n = len(values) kh_int64_t * table = kh_init_int64() ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') kh_resize_int64(table, min(n, _SIZE_HINT_LIMIT)) - with nogil: - if take_last: + if keep not in ('last', 'first', False): + raise ValueError('keep must be either "first", "last" or False') + + if keep == 'last': + with nogil: for i from n > i >=0: kh_put_int64(table, values[i], &ret) out[i] = ret == 0 - else: + elif keep == 'first': + with nogil: for i from 0 <= i < n: kh_put_int64(table, values[i], &ret) out[i] = ret == 0 - + else: + with nogil: + for i from 0 <= i < n: + value = values[i] + k = kh_get_int64(table, value) + if k != table.n_buckets: + out[table.vals[k]] = 1 + out[i] = 1 + else: + k = kh_put_int64(table, value, &ret) + table.keys[k] = value + table.vals[k] = i + out[i] = 0 kh_destroy_int64(table) return out diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 
e839210fbbada..07f0c89535a77 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -1348,35 +1348,47 @@ def fast_zip_fillna(list ndarrays, fill_value=pandas_null): return result -def duplicated(ndarray[object] values, take_last=False): + +def duplicated(ndarray[object] values, object keep='first'): cdef: Py_ssize_t i, n - set seen = set() + dict seen = dict() object row n = len(values) cdef ndarray[uint8_t] result = np.zeros(n, dtype=np.uint8) - if take_last: + if keep == 'last': for i from n > i >= 0: row = values[i] - if row in seen: result[i] = 1 else: - seen.add(row) + seen[row] = i result[i] = 0 - else: + elif keep == 'first': for i from 0 <= i < n: row = values[i] if row in seen: result[i] = 1 else: - seen.add(row) + seen[row] = i result[i] = 0 + elif keep is False: + for i from 0 <= i < n: + row = values[i] + if row in seen: + result[i] = 1 + result[seen[row]] = 1 + else: + seen[row] = i + result[i] = 0 + else: + raise ValueError('keep must be either "first", "last" or False') return result.view(np.bool_) + def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups): cdef: Py_ssize_t i, group_size, n, start diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index d47e7dbe751c7..066b359d72b5c 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -683,6 +683,10 @@ def test_factorize(self): def test_duplicated_drop_duplicates(self): # GH 4060 + + import warnings + warnings.simplefilter('always') + for original in self.objs: if isinstance(original, Index): @@ -714,15 +718,36 @@ def test_duplicated_drop_duplicates(self): self.assertTrue(duplicated.dtype == bool) tm.assert_index_equal(idx.drop_duplicates(), original) - last_base = [False] * len(idx) - last_base[3] = True - last_base[5] = True - expected = np.array(last_base) - duplicated = idx.duplicated(take_last=True) + base = [False] * len(idx) + base[3] = True + base[5] = True + expected = np.array(base) + + duplicated = idx.duplicated(keep='last') + 
tm.assert_numpy_array_equal(duplicated, expected) + self.assertTrue(duplicated.dtype == bool) + result = idx.drop_duplicates(keep='last') + tm.assert_index_equal(result, idx[~expected]) + + # deprecate take_last + with tm.assert_produces_warning(FutureWarning): + duplicated = idx.duplicated(take_last=True) + tm.assert_numpy_array_equal(duplicated, expected) + self.assertTrue(duplicated.dtype == bool) + with tm.assert_produces_warning(FutureWarning): + result = idx.drop_duplicates(take_last=True) + tm.assert_index_equal(result, idx[~expected]) + + base = [False] * len(original) + [True, True] + base[3] = True + base[5] = True + expected = np.array(base) + + duplicated = idx.duplicated(keep=False) tm.assert_numpy_array_equal(duplicated, expected) self.assertTrue(duplicated.dtype == bool) - tm.assert_index_equal(idx.drop_duplicates(take_last=True), - idx[~np.array(last_base)]) + result = idx.drop_duplicates(keep=False) + tm.assert_index_equal(result, idx[~expected]) with tm.assertRaisesRegexp(TypeError, "drop_duplicates\(\) got an unexpected keyword argument"): @@ -745,13 +770,29 @@ def test_duplicated_drop_duplicates(self): tm.assert_series_equal(s.duplicated(), expected) tm.assert_series_equal(s.drop_duplicates(), original) - last_base = [False] * len(idx) - last_base[3] = True - last_base[5] = True - expected = Series(last_base, index=idx, name='a') - tm.assert_series_equal(s.duplicated(take_last=True), expected) - tm.assert_series_equal(s.drop_duplicates(take_last=True), - s[~np.array(last_base)]) + base = [False] * len(idx) + base[3] = True + base[5] = True + expected = Series(base, index=idx, name='a') + + tm.assert_series_equal(s.duplicated(keep='last'), expected) + tm.assert_series_equal(s.drop_duplicates(keep='last'), + s[~np.array(base)]) + + # deprecate take_last + with tm.assert_produces_warning(FutureWarning): + tm.assert_series_equal(s.duplicated(take_last=True), expected) + with tm.assert_produces_warning(FutureWarning): + 
tm.assert_series_equal(s.drop_duplicates(take_last=True), + s[~np.array(base)]) + base = [False] * len(original) + [True, True] + base[3] = True + base[5] = True + expected = Series(base, index=idx, name='a') + + tm.assert_series_equal(s.duplicated(keep=False), expected) + tm.assert_series_equal(s.drop_duplicates(keep=False), + s[~np.array(base)]) s.drop_duplicates(inplace=True) tm.assert_series_equal(s, original) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 77ef5fecf22c9..72eea5162caa5 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -7848,7 +7848,7 @@ def test_dropna_multiple_axes(self): inp.dropna(how='all', axis=(0, 1), inplace=True) assert_frame_equal(inp, expected) - def test_drop_duplicates(self): + def test_aaa_drop_duplicates(self): df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'bar', 'foo'], 'B': ['one', 'one', 'two', 'two', @@ -7861,10 +7861,21 @@ def test_drop_duplicates(self): expected = df[:2] assert_frame_equal(result, expected) - result = df.drop_duplicates('AAA', take_last=True) + result = df.drop_duplicates('AAA', keep='last') expected = df.ix[[6, 7]] assert_frame_equal(result, expected) + result = df.drop_duplicates('AAA', keep=False) + expected = df.ix[[]] + assert_frame_equal(result, expected) + self.assertEqual(len(result), 0) + + # deprecate take_last + with tm.assert_produces_warning(FutureWarning): + result = df.drop_duplicates('AAA', take_last=True) + expected = df.ix[[6, 7]] + assert_frame_equal(result, expected) + # multi column expected = df.ix[[0, 1, 2, 3]] result = df.drop_duplicates(np.array(['AAA', 'B'])) @@ -7872,6 +7883,15 @@ def test_drop_duplicates(self): result = df.drop_duplicates(['AAA', 'B']) assert_frame_equal(result, expected) + result = df.drop_duplicates(('AAA', 'B'), keep='last') + expected = df.ix[[0, 5, 6, 7]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates(('AAA', 'B'), keep=False) + expected = df.ix[[0]] + 
assert_frame_equal(result, expected) + + # deprecate take_last result = df.drop_duplicates(('AAA', 'B'), take_last=True) expected = df.ix[[0, 5, 6, 7]] assert_frame_equal(result, expected) @@ -7884,10 +7904,53 @@ def test_drop_duplicates(self): expected = df2.drop_duplicates(['AAA', 'B']) assert_frame_equal(result, expected) + result = df2.drop_duplicates(keep='last') + expected = df2.drop_duplicates(['AAA', 'B'], keep='last') + assert_frame_equal(result, expected) + + result = df2.drop_duplicates(keep=False) + expected = df2.drop_duplicates(['AAA', 'B'], keep=False) + assert_frame_equal(result, expected) + + # deprecate take_last result = df2.drop_duplicates(take_last=True) expected = df2.drop_duplicates(['AAA', 'B'], take_last=True) assert_frame_equal(result, expected) + def test_drop_duplicates_for_take_all(self): + df = DataFrame({'AAA': ['foo', 'bar', 'baz', 'bar', + 'foo', 'bar', 'qux', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': [1, 1, 2, 2, 2, 2, 1, 2], + 'D': lrange(8)}) + + # single column + result = df.drop_duplicates('AAA') + expected = df.iloc[[0, 1, 2, 6]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates('AAA', keep='last') + expected = df.iloc[[2, 5, 6, 7]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates('AAA', keep=False) + expected = df.iloc[[2, 6]] + assert_frame_equal(result, expected) + + # multiple columns + result = df.drop_duplicates(['AAA', 'B']) + expected = df.iloc[[0, 1, 2, 3, 4, 6]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates(['AAA', 'B'], keep='last') + expected = df.iloc[[0, 1, 2, 5, 6, 7]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates(['AAA', 'B'], keep=False) + expected = df.iloc[[0, 1, 2, 6]] + assert_frame_equal(result, expected) + def test_drop_duplicates_deprecated_warning(self): df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'bar', 'foo'], @@ -7914,6 +7977,14 @@ def 
test_drop_duplicates_deprecated_warning(self): self.assertRaises(TypeError, df.drop_duplicates, kwargs={'subset': 'AAA', 'bad_arg': True}) + # deprecate take_last + # Raises warning + with tm.assert_produces_warning(FutureWarning): + result = df.drop_duplicates(take_last=False, subset='AAA') + assert_frame_equal(result, expected) + + self.assertRaises(ValueError, df.drop_duplicates, keep='invalid_name') + def test_drop_duplicates_tuple(self): df = DataFrame({('AA', 'AB'): ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'bar', 'foo'], @@ -7927,6 +7998,16 @@ def test_drop_duplicates_tuple(self): expected = df[:2] assert_frame_equal(result, expected) + result = df.drop_duplicates(('AA', 'AB'), keep='last') + expected = df.ix[[6, 7]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates(('AA', 'AB'), keep=False) + expected = df.ix[[]] # empty df + self.assertEqual(len(result), 0) + assert_frame_equal(result, expected) + + # deprecate take_last result = df.drop_duplicates(('AA', 'AB'), take_last=True) expected = df.ix[[6, 7]] assert_frame_equal(result, expected) @@ -7950,6 +8031,16 @@ def test_drop_duplicates_NA(self): expected = df.ix[[0, 2, 3]] assert_frame_equal(result, expected) + result = df.drop_duplicates('A', keep='last') + expected = df.ix[[1, 6, 7]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates('A', keep=False) + expected = df.ix[[]] # empty df + assert_frame_equal(result, expected) + self.assertEqual(len(result), 0) + + # deprecate take_last result = df.drop_duplicates('A', take_last=True) expected = df.ix[[1, 6, 7]] assert_frame_equal(result, expected) @@ -7959,6 +8050,15 @@ def test_drop_duplicates_NA(self): expected = df.ix[[0, 2, 3, 6]] assert_frame_equal(result, expected) + result = df.drop_duplicates(['A', 'B'], keep='last') + expected = df.ix[[1, 5, 6, 7]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates(['A', 'B'], keep=False) + expected = df.ix[[6]] + assert_frame_equal(result, expected) + + 
# deprecate take_last result = df.drop_duplicates(['A', 'B'], take_last=True) expected = df.ix[[1, 5, 6, 7]] assert_frame_equal(result, expected) @@ -7976,6 +8076,16 @@ def test_drop_duplicates_NA(self): expected = df[:2] assert_frame_equal(result, expected) + result = df.drop_duplicates('C', keep='last') + expected = df.ix[[3, 7]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates('C', keep=False) + expected = df.ix[[]] # empty df + assert_frame_equal(result, expected) + self.assertEqual(len(result), 0) + + # deprecate take_last result = df.drop_duplicates('C', take_last=True) expected = df.ix[[3, 7]] assert_frame_equal(result, expected) @@ -7985,10 +8095,53 @@ def test_drop_duplicates_NA(self): expected = df.ix[[0, 1, 2, 4]] assert_frame_equal(result, expected) + result = df.drop_duplicates(['C', 'B'], keep='last') + expected = df.ix[[1, 3, 6, 7]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates(['C', 'B'], keep=False) + expected = df.ix[[1]] + assert_frame_equal(result, expected) + + # deprecate take_last result = df.drop_duplicates(['C', 'B'], take_last=True) expected = df.ix[[1, 3, 6, 7]] assert_frame_equal(result, expected) + def test_drop_duplicates_NA_for_take_all(self): + # none + df = DataFrame({'A': [None, None, 'foo', 'bar', + 'foo', 'baz', 'bar', 'qux'], + 'C': [1.0, np.nan, np.nan, np.nan, 1., 2., 3, 1.]}) + + # single column + result = df.drop_duplicates('A') + expected = df.iloc[[0, 2, 3, 5, 7]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates('A', keep='last') + expected = df.iloc[[1, 4, 5, 6, 7]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates('A', keep=False) + expected = df.iloc[[5, 7]] + assert_frame_equal(result, expected) + + # nan + + # single column + result = df.drop_duplicates('C') + expected = df.iloc[[0, 1, 5, 6]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates('C', keep='last') + expected = df.iloc[[3, 5, 6, 7]] + 
assert_frame_equal(result, expected) + + result = df.drop_duplicates('C', keep=False) + expected = df.iloc[[5, 6]] + assert_frame_equal(result, expected) + def test_drop_duplicates_inplace(self): orig = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'bar', 'foo'], @@ -8004,6 +8157,20 @@ def test_drop_duplicates_inplace(self): result = df assert_frame_equal(result, expected) + df = orig.copy() + df.drop_duplicates('A', keep='last', inplace=True) + expected = orig.ix[[6, 7]] + result = df + assert_frame_equal(result, expected) + + df = orig.copy() + df.drop_duplicates('A', keep=False, inplace=True) + expected = orig.ix[[]] + result = df + assert_frame_equal(result, expected) + self.assertEqual(len(df), 0) + + # deprecate take_last df = orig.copy() df.drop_duplicates('A', take_last=True, inplace=True) expected = orig.ix[[6, 7]] @@ -8017,6 +8184,19 @@ def test_drop_duplicates_inplace(self): result = df assert_frame_equal(result, expected) + df = orig.copy() + df.drop_duplicates(['A', 'B'], keep='last', inplace=True) + expected = orig.ix[[0, 5, 6, 7]] + result = df + assert_frame_equal(result, expected) + + df = orig.copy() + df.drop_duplicates(['A', 'B'], keep=False, inplace=True) + expected = orig.ix[[0]] + result = df + assert_frame_equal(result, expected) + + # deprecate take_last df = orig.copy() df.drop_duplicates(['A', 'B'], take_last=True, inplace=True) expected = orig.ix[[0, 5, 6, 7]] @@ -8033,6 +8213,19 @@ def test_drop_duplicates_inplace(self): result = df2 assert_frame_equal(result, expected) + df2 = orig2.copy() + df2.drop_duplicates(keep='last', inplace=True) + expected = orig2.drop_duplicates(['A', 'B'], keep='last') + result = df2 + assert_frame_equal(result, expected) + + df2 = orig2.copy() + df2.drop_duplicates(keep=False, inplace=True) + expected = orig2.drop_duplicates(['A', 'B'], keep=False) + result = df2 + assert_frame_equal(result, expected) + + # deprecate take_last df2 = orig2.copy() df2.drop_duplicates(take_last=True, inplace=True) 
expected = orig2.drop_duplicates(['A', 'B'], take_last=True) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index c7418a5651ad7..d6e57e76d0ec9 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -4720,9 +4720,9 @@ def check(nlevels, with_nulls): labels = [np.random.choice(n, k * n) for lev in levels] mi = MultiIndex(levels=levels, labels=labels) - for take_last in [False, True]: - left = mi.duplicated(take_last=take_last) - right = pd.lib.duplicated(mi.values, take_last=take_last) + for keep in ['first', 'last', False]: + left = mi.duplicated(keep=keep) + right = pd.lib.duplicated(mi.values, keep=keep) tm.assert_numpy_array_equal(left, right) # GH5873 diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 65ba5fd036a35..fbe4eefabe02d 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -2135,6 +2135,21 @@ def test_duplicated_drop_duplicates(self): expected = MultiIndex.from_arrays(([1, 2, 3, 2 ,3], [1, 1, 1, 2, 2])) tm.assert_index_equal(idx.drop_duplicates(), expected) + expected = np.array([True, False, False, False, False, False]) + duplicated = idx.duplicated(keep='last') + tm.assert_numpy_array_equal(duplicated, expected) + self.assertTrue(duplicated.dtype == bool) + expected = MultiIndex.from_arrays(([2, 3, 1, 2 ,3], [1, 1, 1, 2, 2])) + tm.assert_index_equal(idx.drop_duplicates(keep='last'), expected) + + expected = np.array([True, False, False, True, False, False]) + duplicated = idx.duplicated(keep=False) + tm.assert_numpy_array_equal(duplicated, expected) + self.assertTrue(duplicated.dtype == bool) + expected = MultiIndex.from_arrays(([2, 3, 2 ,3], [1, 1, 2, 2])) + tm.assert_index_equal(idx.drop_duplicates(keep=False), expected) + + # deprecate take_last expected = np.array([True, False, False, False, False, False]) duplicated = idx.duplicated(take_last=True) tm.assert_numpy_array_equal(duplicated, expected) diff --git a/pandas/tests/test_series.py 
b/pandas/tests/test_series.py index 66a38cd858846..31843616956f6 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -4782,29 +4782,63 @@ def test_axis_alias(self): self.assertEqual(s._get_axis_name('rows'), 'index') def test_drop_duplicates(self): - s = Series([1, 2, 3, 3]) + # check both int and object + for s in [Series([1, 2, 3, 3]), Series(['1', '2', '3', '3'])]: + expected = Series([False, False, False, True]) + assert_series_equal(s.duplicated(), expected) + assert_series_equal(s.drop_duplicates(), s[~expected]) + sc = s.copy() + sc.drop_duplicates(inplace=True) + assert_series_equal(sc, s[~expected]) - result = s.duplicated() - expected = Series([False, False, False, True]) - assert_series_equal(result, expected) + expected = Series([False, False, True, False]) + assert_series_equal(s.duplicated(keep='last'), expected) + assert_series_equal(s.drop_duplicates(keep='last'), s[~expected]) + sc = s.copy() + sc.drop_duplicates(keep='last', inplace=True) + assert_series_equal(sc, s[~expected]) + # deprecate take_last + assert_series_equal(s.duplicated(take_last=True), expected) + assert_series_equal(s.drop_duplicates(take_last=True), s[~expected]) + sc = s.copy() + sc.drop_duplicates(take_last=True, inplace=True) + assert_series_equal(sc, s[~expected]) - result = s.duplicated(take_last=True) - expected = Series([False, False, True, False]) - assert_series_equal(result, expected) + expected = Series([False, False, True, True]) + assert_series_equal(s.duplicated(keep=False), expected) + assert_series_equal(s.drop_duplicates(keep=False), s[~expected]) + sc = s.copy() + sc.drop_duplicates(keep=False, inplace=True) + assert_series_equal(sc, s[~expected]) + + for s in [Series([1, 2, 3, 5, 3, 2, 4]), + Series(['1', '2', '3', '5', '3', '2', '4'])]: + expected = Series([False, False, False, False, True, True, False]) + assert_series_equal(s.duplicated(), expected) + assert_series_equal(s.drop_duplicates(), s[~expected]) + sc = s.copy() + 
sc.drop_duplicates(inplace=True) + assert_series_equal(sc, s[~expected]) - result = s.drop_duplicates() - expected = s[[True, True, True, False]] - assert_series_equal(result, expected) - sc = s.copy() - sc.drop_duplicates(inplace=True) - assert_series_equal(sc, expected) + expected = Series([False, True, True, False, False, False, False]) + assert_series_equal(s.duplicated(keep='last'), expected) + assert_series_equal(s.drop_duplicates(keep='last'), s[~expected]) + sc = s.copy() + sc.drop_duplicates(keep='last', inplace=True) + assert_series_equal(sc, s[~expected]) + # deprecate take_last + assert_series_equal(s.duplicated(take_last=True), expected) + assert_series_equal(s.drop_duplicates(take_last=True), s[~expected]) + sc = s.copy() + sc.drop_duplicates(take_last=True, inplace=True) + assert_series_equal(sc, s[~expected]) - result = s.drop_duplicates(take_last=True) - expected = s[[True, True, False, True]] - assert_series_equal(result, expected) - sc = s.copy() - sc.drop_duplicates(take_last=True, inplace=True) - assert_series_equal(sc, expected) + expected = Series([False, True, True, False, True, True, False]) + assert_series_equal(s.duplicated(keep=False), expected) + assert_series_equal(s.drop_duplicates(keep=False), s[~expected]) + sc = s.copy() + sc.drop_duplicates(keep=False, inplace=True) + assert_series_equal(sc, s[~expected]) def test_sort(self): ts = self.ts.copy() diff --git a/pandas/tests/test_tseries.py b/pandas/tests/test_tseries.py index 035b3ac07342d..f10d541a7e23b 100644 --- a/pandas/tests/test_tseries.py +++ b/pandas/tests/test_tseries.py @@ -275,10 +275,18 @@ def test_duplicated_with_nas(): expected = [False, False, False, True, False, True] assert(np.array_equal(result, expected)) - result = lib.duplicated(keys, take_last=True) + result = lib.duplicated(keys, keep='first') + expected = [False, False, False, True, False, True] + assert(np.array_equal(result, expected)) + + result = lib.duplicated(keys, keep='last') expected = [True, False, 
True, False, False, False] assert(np.array_equal(result, expected)) + result = lib.duplicated(keys, keep=False) + expected = [True, False, True, True, False, True] + assert(np.array_equal(result, expected)) + keys = np.empty(8, dtype=object) for i, t in enumerate(zip([0, 0, nan, nan] * 2, [0, nan, 0, nan] * 2)): keys[i] = t @@ -289,10 +297,14 @@ def test_duplicated_with_nas(): expected = falses + trues assert(np.array_equal(result, expected)) - result = lib.duplicated(keys, take_last=True) + result = lib.duplicated(keys, keep='last') expected = trues + falses assert(np.array_equal(result, expected)) + result = lib.duplicated(keys, keep=False) + expected = trues + trues + assert(np.array_equal(result, expected)) + def test_maybe_booleans_to_slice(): arr = np.array([0, 0, 1, 1, 1, 0, 1], dtype=np.uint8)