diff --git a/doc/source/merging.rst b/doc/source/merging.rst index 74b21c21252ec..1161656731f88 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -153,10 +153,10 @@ Set logic on the other axes ~~~~~~~~~~~~~~~~~~~~~~~~~~~ When gluing together multiple DataFrames, you have a choice of how to handle -the other axes (other than the one being concatenated). This can be done in +the other axes (other than the one being concatenated). This can be done in the following three ways: -- Take the (sorted) union of them all, ``join='outer'``. This is the default +- Take the union of them all, ``join='outer'``. This is the default option as it results in zero information loss. - Take the intersection, ``join='inner'``. - Use a specific index, as passed to the ``join_axes`` argument. @@ -167,10 +167,10 @@ behavior: .. ipython:: python df4 = pd.DataFrame({'B': ['B2', 'B3', 'B6', 'B7'], - 'D': ['D2', 'D3', 'D6', 'D7'], - 'F': ['F2', 'F3', 'F6', 'F7']}, - index=[2, 3, 6, 7]) - result = pd.concat([df1, df4], axis=1) + 'D': ['D2', 'D3', 'D6', 'D7'], + 'F': ['F2', 'F3', 'F6', 'F7']}, + index=[2, 3, 6, 7]) + result = pd.concat([df1, df4], axis=1, sort=False) .. ipython:: python @@ -181,8 +181,16 @@ behavior: labels=['df1', 'df4'], vertical=False); plt.close('all'); -Note that the row indexes have been unioned and sorted. Here is the same thing -with ``join='inner'``: +.. warning:: + + .. versionchanged:: 0.23.0 + + The default behavior with ``join='outer'`` is to sort the other axis + (columns in this case). In a future version of pandas, the default will + be to not sort. We specified ``sort=False`` to opt in to the new + behavior now. + +Here is the same thing with ``join='inner'``: .. ipython:: python diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 2cf8e3cedf742..fb9e14080e4f6 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -639,6 +639,36 @@ Returning a ``Series`` allows one to control the exact return structure and colu df.apply(lambda x: Series([1, 2, 3], index=['D', 'E', 'F']), axis=1) +.. _whatsnew_0230.api_breaking.concat: + +Concatenation will no longer sort +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In a future version of pandas :func:`pandas.concat` will no longer sort the non-concatenation axis when it is not already aligned. +The current behavior is the same as the previous (sorting), but now a warning is issued when ``sort`` is not specified and the non-concatenation axis is not aligned (:issue:`4588`). + +.. ipython:: python + :okwarning: + + df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=['b', 'a']) + df2 = pd.DataFrame({"a": [4, 5]}) + + pd.concat([df1, df2]) + +To keep the previous behavior (sorting) and silence the warning, pass ``sort=True`` + +.. ipython:: python + + pd.concat([df1, df2], sort=True) + +To accept the future behavior (no sorting), pass ``sort=False`` + +.. ipython:: python + + pd.concat([df1, df2], sort=False) + +Note that this change also applies to :meth:`DataFrame.append`, which has also received a ``sort`` keyword for controlling this behavior. + ..
_whatsnew_0230.api_breaking.build_changes: diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 30521760327b4..ae9d240afcb93 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -157,7 +157,7 @@ def fast_unique_multiple(list arrays): @cython.wraparound(False) @cython.boundscheck(False) -def fast_unique_multiple_list(list lists): +def fast_unique_multiple_list(list lists, bint sort=True): cdef: list buf Py_ssize_t k = len(lists) @@ -174,10 +174,11 @@ def fast_unique_multiple_list(list lists): if val not in table: table[val] = stub uniques.append(val) - try: - uniques.sort() - except Exception: - pass + if sort: + try: + uniques.sort() + except Exception: + pass return uniques diff --git a/pandas/core/base.py b/pandas/core/base.py index 9ca1c8bea4db7..2f25a9ce41369 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -507,7 +507,7 @@ def is_any_frame(): for r in compat.itervalues(result)) if isinstance(result, list): - return concat(result, keys=keys, axis=1), True + return concat(result, keys=keys, axis=1, sort=True), True elif is_any_frame(): # we have a dict of DataFrames diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d7efd777f4176..d475d8b944575 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6038,7 +6038,8 @@ def infer(x): # ---------------------------------------------------------------------- # Merging / joining methods - def append(self, other, ignore_index=False, verify_integrity=False): + def append(self, other, ignore_index=False, + verify_integrity=False, sort=None): """ Append rows of `other` to the end of this frame, returning a new object. Columns not in this frame are added as new columns. @@ -6051,6 +6052,14 @@ def append(self, other, ignore_index=False, verify_integrity=False): If True, do not use the index labels. verify_integrity : boolean, default False If True, raise ValueError on creating index with duplicates. + sort : boolean, default None + Sort columns if the columns of `self` and `other` are not aligned. + The default sorting is deprecated and will change to not-sorting + in a future version of pandas. Explicitly pass ``sort=True`` to + silence the warning and sort. Explicitly pass ``sort=False`` to + silence the warning and not sort. + + .. 
versionadded:: 0.23.0 Returns ------- @@ -6162,7 +6171,8 @@ def append(self, other, ignore_index=False, verify_integrity=False): else: to_concat = [self, other] return concat(to_concat, ignore_index=ignore_index, - verify_integrity=verify_integrity) + verify_integrity=verify_integrity, + sort=sort) def join(self, other, on=None, how='left', lsuffix='', rsuffix='', sort=False): @@ -7481,7 +7491,7 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): from pandas.core.index import _get_objs_combined_axis if columns is None: - columns = _get_objs_combined_axis(data) + columns = _get_objs_combined_axis(data, sort=False) indexer_cache = {} diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 8c20d62117e25..4132d8e69704a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1098,7 +1098,8 @@ def reset_identity(values): group_names = self.grouper.names result = concat(values, axis=self.axis, keys=group_keys, - levels=group_levels, names=group_names) + levels=group_levels, names=group_names, + sort=False) else: # GH5610, returns a MI, with the first level being a diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 2e5ec8b554ce7..f9501cd2f9ddf 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -1,3 +1,6 @@ +import textwrap +import warnings + from pandas.core.indexes.base import (Index, _new_Index, _ensure_index, @@ -17,6 +20,16 @@ from pandas._libs import lib from pandas._libs.tslib import NaT +_sort_msg = textwrap.dedent("""\ +Sorting because non-concatenation axis is not aligned. A future version +of pandas will change to not sort by default. + +To accept the future behavior, pass 'sort=False'. + +To retain the current behavior and silence the warning, pass 'sort=True'. +""") + + # TODO: there are many places that rely on these private methods existing in # pandas.core.index __all__ = ['Index', 'MultiIndex', 'NumericIndex', 'Float64Index', 'Int64Index', @@ -31,33 +44,40 @@ '_all_indexes_same'] -def _get_objs_combined_axis(objs, intersect=False, axis=0): +def _get_objs_combined_axis(objs, intersect=False, axis=0, sort=True): # Extract combined index: return intersection or union (depending on the # value of "intersect") of indexes on given axis, or None if all objects # lack indexes (e.g. they are numpy arrays) obs_idxes = [obj._get_axis(axis) for obj in objs if hasattr(obj, '_get_axis')] if obs_idxes: - return _get_combined_index(obs_idxes, intersect=intersect) + return _get_combined_index(obs_idxes, intersect=intersect, sort=sort) -def _get_combined_index(indexes, intersect=False): +def _get_combined_index(indexes, intersect=False, sort=False): # TODO: handle index names!
indexes = com._get_distinct_objs(indexes) if len(indexes) == 0: - return Index([]) - if len(indexes) == 1: - return indexes[0] - if intersect: + index = Index([]) + elif len(indexes) == 1: + index = indexes[0] + elif intersect: index = indexes[0] for other in indexes[1:]: index = index.intersection(other) - return index - union = _union_indexes(indexes) - return _ensure_index(union) + else: + index = _union_indexes(indexes, sort=sort) + index = _ensure_index(index) + + if sort: + try: + index = index.sort_values() + except TypeError: + pass + return index -def _union_indexes(indexes): +def _union_indexes(indexes, sort=True): if len(indexes) == 0: raise AssertionError('Must have at least 1 Index to union') if len(indexes) == 1: @@ -74,7 +94,8 @@ def conv(i): i = i.tolist() return i - return Index(lib.fast_unique_multiple_list([conv(i) for i in inds])) + return Index( + lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort)) if kind == 'special': result = indexes[0] @@ -89,13 +110,19 @@ def conv(i): index = indexes[0] for other in indexes[1:]: if not index.equals(other): + + if sort is None: + # TODO: remove once pd.concat sort default changes + warnings.warn(_sort_msg, FutureWarning, stacklevel=8) + sort = True + return _unique_indices(indexes) name = _get_consensus_names(indexes)[0] if name != index.name: index = index._shallow_copy(name=name) return index - else: + else: # kind='list' return _unique_indices(indexes) diff --git a/pandas/core/panel.py b/pandas/core/panel.py index e08d0a7368ccb..16e64192fdb20 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -1499,8 +1499,11 @@ def _extract_axis(self, data, axis=0, intersect=False): raw_lengths.append(v.shape[axis]) if have_frames: + # we want the "old" behavior here, of sorting only + # 1. we're doing a union (intersect=False) + # 2. the indices are not aligned. index = _get_objs_combined_axis(data.values(), axis=axis, - intersect=intersect) + intersect=intersect, sort=None) if have_raw_arrays: lengths = list(set(raw_lengths)) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 6e564975f34cd..b36e9b8d900fd 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -20,7 +20,7 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, - copy=True): + sort=None, copy=True): """ Concatenate pandas objects along a particular axis with optional set logic along the other axes. @@ -60,6 +60,19 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, verify_integrity : boolean, default False Check whether the new concatenated axis contains duplicates. This can be very expensive relative to the actual data concatenation + sort : boolean, default None + Sort non-concatenation axis if it is not already aligned when `join` + is 'outer'. The current default of sorting is deprecated and will + change to not-sorting in a future version of pandas. + + Explicitly pass ``sort=True`` to silence the warning and sort. + Explicitly pass ``sort=False`` to silence the warning and not sort. + + This has no effect when ``join='inner'``, which already preserves + the order of the non-concatenation axis. + + .. 
versionadded:: 0.23.0 + copy : boolean, default True If False, do not copy data unnecessarily @@ -209,7 +222,7 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, ignore_index=ignore_index, join=join, keys=keys, levels=levels, names=names, verify_integrity=verify_integrity, - copy=copy) + copy=copy, sort=sort) return op.get_result() @@ -220,7 +233,8 @@ class _Concatenator(object): def __init__(self, objs, axis=0, join='outer', join_axes=None, keys=None, levels=None, names=None, - ignore_index=False, verify_integrity=False, copy=True): + ignore_index=False, verify_integrity=False, copy=True, + sort=False): if isinstance(objs, (NDFrame, compat.string_types)): raise TypeError('first argument must be an iterable of pandas ' 'objects, you passed an object of type ' @@ -355,6 +369,7 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, self.keys = keys self.names = names or getattr(keys, 'names', None) self.levels = levels + self.sort = sort self.ignore_index = ignore_index self.verify_integrity = verify_integrity @@ -447,7 +462,8 @@ def _get_comb_axis(self, i): data_axis = self.objs[0]._get_block_manager_axis(i) try: return _get_objs_combined_axis(self.objs, axis=data_axis, - intersect=self.intersect) + intersect=self.intersect, + sort=self.sort) except IndexError: types = [type(x).__name__ for x in self.objs] raise TypeError("Cannot concatenate list of {types}" diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 74a9b59d3194a..96f8a53b4d253 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -437,7 +437,8 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None, rownames = _get_names(index, rownames, prefix='row') colnames = _get_names(columns, colnames, prefix='col') - common_idx = _get_objs_combined_axis(index + columns, intersect=True) + common_idx = _get_objs_combined_axis(index + columns, intersect=True, + sort=False) data = {} data.update(zip(rownames, index)) diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index e82faaeef2986..15ca65395e4fc 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -96,7 +96,7 @@ def test_append_series_dict(self): result = df.append(series[::-1][:3], ignore_index=True) expected = df.append(DataFrame({0: series[::-1][:3]}).T, - ignore_index=True) + ignore_index=True, sort=True) assert_frame_equal(result, expected.loc[:, result.columns]) # can append when name set @@ -119,8 +119,8 @@ def test_append_list_of_series_dicts(self): # different columns dicts = [{'foo': 1, 'bar': 2, 'baz': 3, 'peekaboo': 4}, {'foo': 5, 'bar': 6, 'baz': 7, 'peekaboo': 8}] - result = df.append(dicts, ignore_index=True) - expected = df.append(DataFrame(dicts), ignore_index=True) + result = df.append(dicts, ignore_index=True, sort=True) + expected = df.append(DataFrame(dicts), ignore_index=True, sort=True) assert_frame_equal(result, expected) def test_append_empty_dataframe(self): diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 47b7d60e3b6e8..6dd38187f7277 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1071,6 +1071,17 @@ def test_constructor_list_of_series(self): expected = DataFrame.from_dict(sdict, orient='index') tm.assert_frame_equal(result, expected) + def test_constructor_list_of_series_aligned_index(self): + series = [pd.Series(i, index=['b', 'a', 'c'], 
name=str(i)) + for i in range(3)] + result = pd.DataFrame(series) + expected = pd.DataFrame({'b': [0, 1, 2], + 'a': [0, 1, 2], + 'c': [0, 1, 2]}, + columns=['b', 'a', 'c'], + index=['0', '1', '2']) + tm.assert_frame_equal(result, expected) + def test_constructor_list_of_derived_dicts(self): class CustomDict(dict): pass diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index f1178d44dbfe0..bfc74db73b813 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -629,7 +629,8 @@ def test_iloc_non_unique_indexing(self): new_list.append(s * 3) expected = DataFrame(new_list) - expected = concat([expected, DataFrame(index=idx[idx > sidx.max()])]) + expected = concat([expected, DataFrame(index=idx[idx > sidx.max()])], + sort=True) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df2.loc[idx] tm.assert_frame_equal(result, expected, check_index_type=False) diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index f95f493c66043..3c7a7f070805d 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -156,8 +156,9 @@ def f(): df_orig = DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D']) - expected = pd.concat([df_orig, DataFrame( - {'A': 7}, index=[dates[-1] + 1])]) + expected = pd.concat([df_orig, + DataFrame({'A': 7}, index=[dates[-1] + 1])], + sort=True) df = df_orig.copy() df.loc[dates[-1] + 1, 'A'] = 7 tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index dbf7c7f100b0e..f3827ac251cf0 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -826,7 +826,7 @@ def test_validation(self): # Dups on left left_w_dups = left.append(pd.DataFrame({'a': ['a'], 'c': ['cow']}, - index=[3])) + index=[3]), sort=True) merge(left_w_dups, right, left_index=True, right_index=True, validate='many_to_one') @@ -1286,7 +1286,7 @@ def test_join_multi_levels(self): index=MultiIndex.from_tuples( [(4, np.nan)], names=['household_id', 'asset_id']))) - ], axis=0).reindex(columns=expected.columns)) + ], axis=0, sort=True).reindex(columns=expected.columns)) assert_frame_equal(result, expected) # invalid cases diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 640d09f3587fb..57af67422d65f 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -21,6 +21,22 @@ import pytest +@pytest.fixture(params=[True, False]) +def sort(request): + """Boolean sort keyword for concat and DataFrame.append.""" + return request.param + + +@pytest.fixture(params=[True, False, None]) +def sort_with_none(request): + """Boolean sort keyword for concat and DataFrame.append. + + Includes the default of None + """ + # TODO: Replace with sort once keyword changes. 
+ return request.param + + class ConcatenateBase(object): def setup_method(self, method): @@ -716,7 +732,7 @@ def test_concat_categorical_empty(self): class TestAppend(ConcatenateBase): - def test_append(self): + def test_append(self, sort): begin_index = self.frame.index[:5] end_index = self.frame.index[5:] @@ -727,10 +743,10 @@ def test_append(self): tm.assert_almost_equal(appended['A'], self.frame['A']) del end_frame['A'] - partial_appended = begin_frame.append(end_frame) + partial_appended = begin_frame.append(end_frame, sort=sort) assert 'A' in partial_appended - partial_appended = end_frame.append(begin_frame) + partial_appended = end_frame.append(begin_frame, sort=sort) assert 'A' in partial_appended # mixed type handling @@ -738,8 +754,9 @@ def test_append(self): tm.assert_frame_equal(appended, self.mixed_frame) # what to test here - mixed_appended = self.mixed_frame[:5].append(self.frame[5:]) - mixed_appended2 = self.frame[:5].append(self.mixed_frame[5:]) + mixed_appended = self.mixed_frame[:5].append(self.frame[5:], sort=sort) + mixed_appended2 = self.frame[:5].append(self.mixed_frame[5:], + sort=sort) # all equal except 'foo' column tm.assert_frame_equal( @@ -769,10 +786,10 @@ def test_append(self): result = df.append(row) tm.assert_frame_equal(result, expected) - def test_append_length0_frame(self): + def test_append_length0_frame(self, sort): df = DataFrame(columns=['A', 'B', 'C']) df3 = DataFrame(index=[0, 1], columns=['A', 'B']) - df5 = df.append(df3) + df5 = df.append(df3, sort=sort) expected = DataFrame(index=[0, 1], columns=['A', 'B', 'C']) assert_frame_equal(df5, expected) @@ -793,7 +810,33 @@ def test_append_records(self): expected = DataFrame(np.concatenate((arr1, arr2))) assert_frame_equal(result, expected) - def test_append_different_columns(self): + # rewrite sort fixture, since we also want to test default of None + def test_append_sorts(self, sort_with_none): + df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=['b', 'a']) + df2 = pd.DataFrame({"a": [1, 2], 'c': [3, 4]}, index=[2, 3]) + + if sort_with_none is None: + # only warn if not explicitly specified + # don't check stacklevel since its set for concat, and append + # has an extra stack. 
+ ctx = tm.assert_produces_warning(FutureWarning, + check_stacklevel=False) + else: + ctx = tm.assert_produces_warning(None) + + with ctx: + result = df1.append(df2, sort=sort_with_none) + + # for None / True + expected = pd.DataFrame({"b": [1, 2, None, None], + "a": [1, 2, 1, 2], + "c": [None, None, 3, 4]}, + columns=['a', 'b', 'c']) + if sort_with_none is False: + expected = expected[['b', 'a', 'c']] + tm.assert_frame_equal(result, expected) + + def test_append_different_columns(self, sort): df = DataFrame({'bools': np.random.randn(10) > 0, 'ints': np.random.randint(0, 10, 10), 'floats': np.random.randn(10), @@ -802,11 +845,11 @@ def test_append_different_columns(self): a = df[:5].loc[:, ['bools', 'ints', 'floats']] b = df[5:].loc[:, ['strings', 'ints', 'floats']] - appended = a.append(b) + appended = a.append(b, sort=sort) assert isna(appended['strings'][0:4]).all() assert isna(appended['bools'][5:]).all() - def test_append_many(self): + def test_append_many(self, sort): chunks = [self.frame[:5], self.frame[5:10], self.frame[10:15], self.frame[15:]] @@ -815,7 +858,7 @@ def test_append_many(self): chunks[-1] = chunks[-1].copy() chunks[-1]['foo'] = 'bar' - result = chunks[0].append(chunks[1:]) + result = chunks[0].append(chunks[1:], sort=sort) tm.assert_frame_equal(result.loc[:, self.frame.columns], self.frame) assert (result['foo'][15:] == 'bar').all() assert result['foo'][:15].isna().all() @@ -923,7 +966,7 @@ def test_append_different_columns_types_raises( with pytest.raises(TypeError): df.append(ser) - def test_append_dtype_coerce(self): + def test_append_dtype_coerce(self, sort): # GH 4993 # appending with datetime will incorrectly convert datetime64 @@ -946,16 +989,22 @@ def test_append_dtype_coerce(self): dt.datetime(2013, 1, 2, 0, 0), dt.datetime(2013, 1, 3, 0, 0), dt.datetime(2013, 1, 4, 0, 0)], - name='start_time')], axis=1) - result = df1.append(df2, ignore_index=True) + name='start_time')], + axis=1, sort=sort) + result = df1.append(df2, ignore_index=True, sort=sort) + if sort: + expected = expected[['end_time', 'start_time']] + else: + expected = expected[['start_time', 'end_time']] + assert_frame_equal(result, expected) - def test_append_missing_column_proper_upcast(self): + def test_append_missing_column_proper_upcast(self, sort): df1 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='i8')}) df2 = DataFrame({'B': np.array([True, False, True, False], dtype=bool)}) - appended = df1.append(df2, ignore_index=True) + appended = df1.append(df2, ignore_index=True, sort=sort) assert appended['A'].dtype == 'f8' assert appended['B'].dtype == 'O' @@ -1043,7 +1092,7 @@ def test_concat_keys_specific_levels(self): Index(level, name='group_key')) assert result.columns.names[0] == 'group_key' - def test_concat_dataframe_keys_bug(self): + def test_concat_dataframe_keys_bug(self, sort): t1 = DataFrame({ 'value': Series([1, 2, 3], index=Index(['a', 'b', 'c'], name='id'))}) @@ -1051,7 +1100,7 @@ def test_concat_dataframe_keys_bug(self): 'value': Series([7, 8], index=Index(['a', 'b'], name='id'))}) # it works - result = concat([t1, t2], axis=1, keys=['t1', 't2']) + result = concat([t1, t2], axis=1, keys=['t1', 't2'], sort=sort) assert list(result.columns) == [('t1', 'value'), ('t2', 'value')] def test_concat_series_partial_columns_names(self): @@ -1097,7 +1146,7 @@ def test_concat_dict(self): expected = concat([frames[k] for k in keys], keys=keys) tm.assert_frame_equal(result, expected) - def test_concat_ignore_index(self): + def test_concat_ignore_index(self, sort): frame1 = DataFrame({"test1": 
["a", "b", "c"], "test2": [1, 2, 3], "test3": [4.5, 3.2, 1.2]}) @@ -1105,7 +1154,8 @@ def test_concat_ignore_index(self): frame1.index = Index(["x", "y", "z"]) frame2.index = Index(["x", "y", "q"]) - v1 = concat([frame1, frame2], axis=1, ignore_index=True) + v1 = concat([frame1, frame2], axis=1, + ignore_index=True, sort=sort) nan = np.nan expected = DataFrame([[nan, nan, nan, 4.3], @@ -1113,6 +1163,8 @@ def test_concat_ignore_index(self): ['b', 2, 3.2, 2.2], ['c', 3, 1.2, nan]], index=Index(["q", "x", "y", "z"])) + if not sort: + expected = expected.loc[['x', 'y', 'z', 'q']] tm.assert_frame_equal(v1, expected) @@ -1309,16 +1361,16 @@ def test_dups_index(self): result = df.append(df) assert_frame_equal(result, expected) - def test_with_mixed_tuples(self): + def test_with_mixed_tuples(self, sort): # 10697 # columns have mixed tuples, so handle properly df1 = DataFrame({u'A': 'foo', (u'B', 1): 'bar'}, index=range(2)) df2 = DataFrame({u'B': 'foo', (u'B', 1): 'bar'}, index=range(2)) # it works - concat([df1, df2]) + concat([df1, df2], sort=sort) - def test_handle_empty_objects(self): + def test_handle_empty_objects(self, sort): df = DataFrame(np.random.randn(10, 4), columns=list('abcd')) baz = df[:5].copy() @@ -1326,7 +1378,7 @@ def test_handle_empty_objects(self): empty = df[5:5] frames = [baz, empty, empty, df[5:]] - concatted = concat(frames, axis=0) + concatted = concat(frames, axis=0, sort=sort) expected = df.reindex(columns=['a', 'b', 'c', 'd', 'foo']) expected['foo'] = expected['foo'].astype('O') @@ -1478,7 +1530,7 @@ def test_panel_concat_other_axes(self): expected.loc['ItemC', :, :2] = 'baz' tm.assert_panel_equal(result, expected) - def test_panel_concat_buglet(self): + def test_panel_concat_buglet(self, sort): with catch_warnings(record=True): # #2257 def make_panel(): @@ -1503,7 +1555,7 @@ def df(): panel3 = panel3.rename_axis(lambda x: '%s_1' % x, axis=2) # it works! 
- concat([panel1, panel3], axis=1, verify_integrity=True) + concat([panel1, panel3], axis=1, verify_integrity=True, sort=sort) def test_concat_series(self): @@ -1528,7 +1580,7 @@ def test_concat_series(self): expected.index = exp_index tm.assert_series_equal(result, expected) - def test_concat_series_axis1(self): + def test_concat_series_axis1(self, sort=sort): ts = tm.makeTimeSeries() pieces = [ts[:-2], ts[2:], ts[2:-2]] @@ -1557,7 +1609,7 @@ def test_concat_series_axis1(self): # must reindex, #2603 s = Series(randn(3), index=['c', 'a', 'b'], name='A') s2 = Series(randn(4), index=['d', 'a', 'b', 'c'], name='B') - result = concat([s, s2], axis=1) + result = concat([s, s2], axis=1, sort=sort) expected = DataFrame({'A': s, 'B': s2}) assert_frame_equal(result, expected) @@ -2043,7 +2095,7 @@ def test_categorical_concat_dtypes(self): expected = Series([True, False, False], index=index) tm.assert_series_equal(result, expected) - def test_categorical_concat(self): + def test_categorical_concat(self, sort): # See GH 10177 df1 = DataFrame(np.arange(18, dtype='int64').reshape(6, 3), columns=["a", "b", "c"]) @@ -2054,7 +2106,7 @@ def test_categorical_concat(self): cat_values = ["one", "one", "two", "one", "two", "two", "one"] df2['h'] = Series(Categorical(cat_values)) - res = pd.concat((df1, df2), axis=0, ignore_index=True) + res = pd.concat((df1, df2), axis=0, ignore_index=True, sort=sort) exp = DataFrame({'a': [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12], 'b': [1, 4, 7, 10, 13, 16, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], @@ -2162,10 +2214,15 @@ def test_concat_order(self): dfs = [pd.DataFrame(index=range(3), columns=['a', 1, None])] dfs += [pd.DataFrame(index=range(3), columns=[None, 1, 'a']) for i in range(100)] - result = pd.concat(dfs).columns - expected = dfs[0].columns + + result = pd.concat(dfs, sort=True).columns + if PY2: - expected = expected.sort_values() + # Different sort order between incomparable objects between + # python 2 and python3 via Index.union. 
+ expected = dfs[1].columns + else: + expected = dfs[0].columns tm.assert_index_equal(result, expected) def test_concat_datetime_timezone(self): @@ -2249,3 +2306,98 @@ def test_concat_empty_and_non_empty_series_regression(): expected = s1 result = pd.concat([s1, s2]) tm.assert_series_equal(result, expected) + + +def test_concat_sorts_columns(sort_with_none): + # GH-4588 + df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=['b', 'a']) + df2 = pd.DataFrame({"a": [3, 4], "c": [5, 6]}) + + # for sort=True/None + expected = pd.DataFrame({"a": [1, 2, 3, 4], + "b": [1, 2, None, None], + "c": [None, None, 5, 6]}, + columns=['a', 'b', 'c']) + + if sort_with_none is False: + expected = expected[['b', 'a', 'c']] + + if sort_with_none is None: + # only warn if not explicitly specified + ctx = tm.assert_produces_warning(FutureWarning) + else: + ctx = tm.assert_produces_warning(None) + + # default + with ctx: + result = pd.concat([df1, df2], ignore_index=True, sort=sort_with_none) + tm.assert_frame_equal(result, expected) + + +def test_concat_sorts_index(sort_with_none): + df1 = pd.DataFrame({"a": [1, 2, 3]}, index=['c', 'a', 'b']) + df2 = pd.DataFrame({"b": [1, 2]}, index=['a', 'b']) + + # For True/None + expected = pd.DataFrame({"a": [2, 3, 1], "b": [1, 2, None]}, + index=['a', 'b', 'c'], + columns=['a', 'b']) + if sort_with_none is False: + expected = expected.loc[['c', 'a', 'b']] + + if sort_with_none is None: + # only warn if not explicitly specified + ctx = tm.assert_produces_warning(FutureWarning) + else: + ctx = tm.assert_produces_warning(None) + + # Warn and sort by default + with ctx: + result = pd.concat([df1, df2], axis=1, sort=sort_with_none) + tm.assert_frame_equal(result, expected) + + +def test_concat_inner_sort(sort_with_none): + # https://github.com/pandas-dev/pandas/pull/20613 + df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2], "c": [1, 2]}, + columns=['b', 'a', 'c']) + df2 = pd.DataFrame({"a": [1, 2], 'b': [3, 4]}, index=[3, 4]) + + with tm.assert_produces_warning(None): + # unset sort should *not* warn for inner join + # since that never sorted + result = pd.concat([df1, df2], sort=sort_with_none, + join='inner', + ignore_index=True) + + expected = pd.DataFrame({"b": [1, 2, 3, 4], "a": [1, 2, 1, 2]}, + columns=['b', 'a']) + if sort_with_none is True: + expected = expected[['a', 'b']] + tm.assert_frame_equal(result, expected) + + +def test_concat_aligned_sort(): + # GH-4588 + df = pd.DataFrame({"c": [1, 2], "b": [3, 4], 'a': [5, 6]}, + columns=['c', 'b', 'a']) + result = pd.concat([df, df], sort=True, ignore_index=True) + expected = pd.DataFrame({'a': [5, 6, 5, 6], 'b': [3, 4, 3, 4], + 'c': [1, 2, 1, 2]}, + columns=['a', 'b', 'c']) + tm.assert_frame_equal(result, expected) + + result = pd.concat([df, df[['c', 'b']]], join='inner', sort=True, + ignore_index=True) + expected = expected[['b', 'c']] + tm.assert_frame_equal(result, expected) + + +def test_concat_aligned_sort_does_not_raise(): + # GH-4588 + # We catch TypeErrors from sorting internally and do not re-raise. 
+ df = pd.DataFrame({1: [1, 2], "a": [3, 4]}, columns=[1, 'a']) + expected = pd.DataFrame({1: [1, 2, 1, 2], 'a': [3, 4, 3, 4]}, + columns=[1, 'a']) + result = pd.concat([df, df], ignore_index=True, sort=True) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 1004b40bfb4c1..db287a719ae1e 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1724,3 +1724,15 @@ def test_crosstab_tuple_name(self, names): result = pd.crosstab(s1, s2) tm.assert_frame_equal(result, expected) + + def test_crosstab_unsorted_order(self): + df = pd.DataFrame({"b": [3, 1, 2], 'a': [5, 4, 6]}, + index=['C', 'A', 'B']) + result = pd.crosstab(df.index, [df.b, df.a]) + e_idx = pd.Index(['A', 'B', 'C'], name='row_0') + e_columns = pd.MultiIndex.from_tuples([(1, 4), (2, 6), (3, 5)], + names=['b', 'a']) + expected = pd.DataFrame([[1, 0, 0], [0, 1, 0], [0, 0, 1]], + index=e_idx, + columns=e_columns) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index 540933cb90be2..9cc615e15564f 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -629,10 +629,31 @@ def test_append(self): a = self.frame.iloc[:5, :3] b = self.frame.iloc[5:] - appended = a.append(b) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + # Stacklevel is set for pd.concat, not append + appended = a.append(b) tm.assert_sp_frame_equal(appended.iloc[:, :3], self.frame.iloc[:, :3], exact_indices=False) + a = a[['B', 'C', 'A']].head(2) + b = b.head(2) + + expected = pd.SparseDataFrame({ + "B": [0., 1, None, 3], + "C": [0., 1, 5, 6], + "A": [None, None, 2, 3], + "D": [None, None, 5, None], + }, index=a.index | b.index, columns=['B', 'C', 'A', 'D']) + with tm.assert_produces_warning(None): + appended = a.append(b, sort=False) + + tm.assert_frame_equal(appended, expected) + + with tm.assert_produces_warning(None): + appended = a.append(b, sort=True) + + tm.assert_sp_frame_equal(appended, expected[['A', 'B', 'C', 'D']]) + def test_astype(self): sparse = pd.SparseDataFrame({'A': SparseArray([1, 2, 3, 4], dtype=np.int64), diff --git a/pandas/tests/sparse/test_combine_concat.py b/pandas/tests/sparse/test_combine_concat.py index 70fd1da529d46..9e392457edbc3 100644 --- a/pandas/tests/sparse/test_combine_concat.py +++ b/pandas/tests/sparse/test_combine_concat.py @@ -202,17 +202,29 @@ def test_concat_different_fill_value(self): exp._default_fill_value = np.nan tm.assert_sp_frame_equal(res, exp) + def test_concat_different_columns_sort_warns(self): + sparse = self.dense1.to_sparse() + sparse3 = self.dense3.to_sparse() + + with tm.assert_produces_warning(FutureWarning): + res = pd.concat([sparse, sparse3]) + with tm.assert_produces_warning(FutureWarning): + exp = pd.concat([self.dense1, self.dense3]) + + exp = exp.to_sparse() + tm.assert_sp_frame_equal(res, exp) + def test_concat_different_columns(self): # fill_value = np.nan sparse = self.dense1.to_sparse() sparse3 = self.dense3.to_sparse() - res = pd.concat([sparse, sparse3]) - exp = pd.concat([self.dense1, self.dense3]).to_sparse() + res = pd.concat([sparse, sparse3], sort=True) + exp = pd.concat([self.dense1, self.dense3], sort=True).to_sparse() tm.assert_sp_frame_equal(res, exp) - res = pd.concat([sparse3, sparse]) - exp = pd.concat([self.dense3, self.dense1]).to_sparse() + res = pd.concat([sparse3, sparse], sort=True) + exp = 
pd.concat([self.dense3, self.dense1], sort=True).to_sparse() exp._default_fill_value = np.nan tm.assert_sp_frame_equal(res, exp) @@ -220,13 +232,15 @@ def test_concat_different_columns(self): sparse = self.dense1.to_sparse(fill_value=0) sparse3 = self.dense3.to_sparse(fill_value=0) - res = pd.concat([sparse, sparse3]) - exp = pd.concat([self.dense1, self.dense3]).to_sparse(fill_value=0) + res = pd.concat([sparse, sparse3], sort=True) + exp = (pd.concat([self.dense1, self.dense3], sort=True) + .to_sparse(fill_value=0)) exp._default_fill_value = np.nan tm.assert_sp_frame_equal(res, exp) - res = pd.concat([sparse3, sparse]) - exp = pd.concat([self.dense3, self.dense1]).to_sparse(fill_value=0) + res = pd.concat([sparse3, sparse], sort=True) + exp = (pd.concat([self.dense3, self.dense1], sort=True) + .to_sparse(fill_value=0)) exp._default_fill_value = np.nan tm.assert_sp_frame_equal(res, exp) @@ -234,13 +248,13 @@ def test_concat_different_columns(self): sparse = self.dense1.to_sparse() sparse3 = self.dense3.to_sparse(fill_value=0) # each columns keeps its fill_value, thus compare in dense - res = pd.concat([sparse, sparse3]) - exp = pd.concat([self.dense1, self.dense3]) + res = pd.concat([sparse, sparse3], sort=True) + exp = pd.concat([self.dense1, self.dense3], sort=True) assert isinstance(res, pd.SparseDataFrame) tm.assert_frame_equal(res.to_dense(), exp) - res = pd.concat([sparse3, sparse]) - exp = pd.concat([self.dense3, self.dense1]) + res = pd.concat([sparse3, sparse], sort=True) + exp = pd.concat([self.dense3, self.dense1], sort=True) assert isinstance(res, pd.SparseDataFrame) tm.assert_frame_equal(res.to_dense(), exp)
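
The following sketch is not part of the patch. It summarizes the user-facing behavior that the whatsnew entry, the ``concat``/``append`` docstrings, and the ``test_concat_sorts_columns``/``test_append_sorts`` tests above describe, assuming a pandas build that includes this change (0.23.0 or later)::

    import warnings

    import pandas as pd

    df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=['b', 'a'])
    df2 = pd.DataFrame({"a": [4, 5]})

    # Default: ``sort`` is unspecified and the columns are not aligned, so the
    # result is still sorted, but a FutureWarning announces the coming change.
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        default = pd.concat([df1, df2], ignore_index=True)
    assert list(default.columns) == ['a', 'b']
    assert any(issubclass(msg.category, FutureWarning) for msg in w)

    # sort=True keeps the current (sorting) behavior and silences the warning.
    sorted_cols = pd.concat([df1, df2], ignore_index=True, sort=True)
    assert list(sorted_cols.columns) == ['a', 'b']

    # sort=False opts in to the future behavior: columns keep their order of
    # first appearance.
    unsorted_cols = pd.concat([df1, df2], ignore_index=True, sort=False)
    assert list(unsorted_cols.columns) == ['b', 'a']

    # DataFrame.append forwards the same keyword to concat.
    appended = df1.append(df2, ignore_index=True, sort=False)
    assert list(appended.columns) == ['b', 'a']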
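
A second sketch, also not part of the patch, illustrates the design choice exercised by ``test_concat_inner_sort`` above: the deprecation only concerns ``join='outer'``; with ``join='inner'`` the non-concatenation axis was never sorted by default, so leaving ``sort`` unspecified keeps the first frame's column order and raises no warning (again assuming pandas 0.23.0 or later)::

    import pandas as pd

    df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2], "c": [1, 2]},
                       columns=['b', 'a', 'c'])
    df2 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[3, 4])

    # Intersection of the columns, in the order of the first frame; no
    # FutureWarning is emitted because inner joins never sorted by default.
    result = pd.concat([df1, df2], join='inner', ignore_index=True)
    assert list(result.columns) == ['b', 'a']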