From 69c51a81a72344f4a2b723382e9195554b941b28 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 1 Sep 2023 17:44:41 +0200 Subject: [PATCH 01/11] move implicit mindex warning deeper in the stack So that it is caught in more cases. --- xarray/core/coordinates.py | 30 +++++------------------------- xarray/core/indexes.py | 16 ++++++++++++++-- 2 files changed, 19 insertions(+), 27 deletions(-) diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index bebf9362532..3c5c983fc5d 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -951,31 +951,11 @@ def create_coords_with_default_indexes( indexes: dict[Hashable, Index] = {} variables: dict[Hashable, Variable] = {} - # promote any pandas multi-index in data_vars as coordinates - coords_promoted: dict[Hashable, Any] = {} - pd_mindex_keys: list[Hashable] = [] - - for k, v in all_variables.items(): - if isinstance(v, pd.MultiIndex): - coords_promoted[k] = v - pd_mindex_keys.append(k) - elif k in coords: - coords_promoted[k] = v - - if pd_mindex_keys: - pd_mindex_keys_fmt = ",".join([f"'{k}'" for k in pd_mindex_keys]) - emit_user_level_warning( - f"the `pandas.MultiIndex` object(s) passed as {pd_mindex_keys_fmt} coordinate(s) or " - "data variable(s) will no longer be implicitly promoted and wrapped into " - "multiple indexed coordinates in the future " - "(i.e., one coordinate for each multi-index level + one dimension coordinate). " - "If you want to keep this behavior, you need to first wrap it explicitly using " - "`mindex_coords = xarray.Coordinates.from_pandas_multiindex(mindex_obj, 'dim')` " - "and pass it as coordinates, e.g., `xarray.Dataset(coords=mindex_coords)`, " - "`dataset.assign_coords(mindex_coords)` or `dataarray.assign_coords(mindex_coords)`.", - FutureWarning, - ) - + coords_promoted: dict[Hashable, Any] = { + k: v + for k, v in all_variables.items() + if k in coords or isinstance(v, pd.MultiIndex) + } dataarray_coords: list[DataArrayCoordinates] = [] for name, obj in coords_promoted.items(): diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py index b5e396963a1..6287c2f508c 100644 --- a/xarray/core/indexes.py +++ b/xarray/core/indexes.py @@ -1335,12 +1335,13 @@ def rename(self, name_dict, dims_dict): def create_default_index_implicit( dim_variable: Variable, all_variables: Mapping | Iterable[Hashable] | None = None, + warn_multi_index: bool = True, ) -> tuple[PandasIndex, IndexVars]: """Create a default index from a dimension variable. Create a PandasMultiIndex if the given variable wraps a pandas.MultiIndex, otherwise create a PandasIndex (note that this will become obsolete once we - depreciate implicitly passing a pandas.MultiIndex as a coordinate). + deprecate implicitly passing a pandas.MultiIndex as a coordinate). """ if all_variables is None: @@ -1353,6 +1354,15 @@ def create_default_index_implicit( index: PandasIndex if isinstance(array, pd.MultiIndex): + if warn_multi_index: + # raise ValueError("no pd.MultiIndex please!") + emit_user_level_warning( + f"the `pandas.MultiIndex` object wrapped in variable {name} will no longer " + "be implicitly promoted into multiple indexed coordinates in the future. " + "If you want to pass it as coordinates, you need to first wrap it explicitly " + "using `xarray.Coordinates.from_pandas_multiindex()`.", + FutureWarning, + ) index = PandasMultiIndex(array, name) index_vars = index.create_variables() # check for conflict between level names and variable names @@ -1680,7 +1690,9 @@ def default_indexes( for name, var in coords.items(): if name in dims and var.ndim == 1: - index, index_vars = create_default_index_implicit(var, coords) + index, index_vars = create_default_index_implicit( + var, coords, warn_multi_index=False + ) if set(index_vars) <= coord_names: indexes.update({k: index for k in index_vars}) From 7889af9dc7727faa27f7a29c38f22e37578eccd1 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 1 Sep 2023 17:46:27 +0200 Subject: [PATCH 02/11] wip: 1st pass silencing warnings --- xarray/tests/test_dataset.py | 93 +++++++++++++++++++++++++----------- 1 file changed, 65 insertions(+), 28 deletions(-) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index e119cfe9bc6..908e4d60868 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -356,7 +356,8 @@ def test_repr_multiindex(self) -> None: mindex = pd.MultiIndex.from_product( [["a", "b"], [1, 2]], names=("a_quite_long_level_name", "level_2") ) - data = Dataset({}, {"x": mindex}) + mindex_coords = Coordinates.from_pandas_multiindex(mindex, "x") + data = Dataset({}, mindex_coords) expected = dedent( """\ @@ -630,9 +631,14 @@ def test_constructor_with_coords(self) -> None: mindex = pd.MultiIndex.from_product( [["a", "b"], [1, 2]], names=("level_1", "level_2") ) - with pytest.raises(ValueError, match=r"conflicting MultiIndex"): - Dataset({}, {"x": mindex, "y": mindex}) - Dataset({}, {"x": mindex, "level_1": range(4)}) + + with pytest.warns( + FutureWarning, + match=".*`pandas.MultiIndex`.*no longer be implicitly promoted", + ): + with pytest.raises(ValueError, match=r"conflicting MultiIndex"): + Dataset({}, {"x": mindex, "y": mindex}) + Dataset({}, {"x": mindex, "level_1": range(4)}) def test_constructor_no_default_index(self) -> None: # explicitly passing a Coordinates object skips the creation of default index @@ -1604,19 +1610,20 @@ def test_sel_dataarray(self) -> None: def test_sel_dataarray_mindex(self) -> None: midx = pd.MultiIndex.from_product([list("abc"), [0, 1]], names=("one", "two")) + midx_coords = Coordinates.from_pandas_multiindex(midx, "x") mds = xr.Dataset( {"var": (("x", "y"), np.random.rand(6, 3))}, - coords={"x": midx, "y": range(3)}, + coords=midx_coords.merge({"y": range(3)}).coords, ) actual_isel = mds.isel(x=xr.DataArray(np.arange(3), dims="x")) - actual_sel = mds.sel(x=DataArray(midx[:3], dims="x")) + actual_sel = mds.sel(x=DataArray(np.array(midx[:3]), dims="x")) assert actual_isel["x"].dims == ("x",) assert actual_sel["x"].dims == ("x",) assert_identical(actual_isel, actual_sel) actual_isel = mds.isel(x=xr.DataArray(np.arange(3), dims="z")) - actual_sel = mds.sel(x=Variable("z", midx[:3])) + actual_sel = mds.sel(x=Variable("z", np.array(midx[:3]))) assert actual_isel["x"].dims == ("z",) assert actual_sel["x"].dims == ("z",) assert_identical(actual_isel, actual_sel) @@ -1724,7 +1731,8 @@ def test_sel_drop(self) -> None: def test_sel_drop_mindex(self) -> None: midx = pd.MultiIndex.from_arrays([["a", "a"], [1, 2]], names=("foo", "bar")) - data = Dataset(coords={"x": midx}) + midx_coords = Coordinates.from_pandas_multiindex(midx, "x") + data = Dataset(coords=midx_coords) actual = data.sel(foo="a", drop=True) assert "foo" not in actual.coords @@ -1948,7 +1956,8 @@ def test_selection_multiindex(self) -> None: mindex = pd.MultiIndex.from_product( [["a", "b"], [1, 2], [-1, -2]], names=("one", "two", "three") ) - mdata = Dataset(data_vars={"var": ("x", range(8))}, coords={"x": mindex}) + mindex_coords = Coordinates.from_pandas_multiindex(mindex, "x") + mdata = Dataset(data_vars={"var": ("x", range(8))}, coords=mindex_coords) def test_sel( lab_indexer, pos_indexer, replaced_idx=False, renamed_dim=None @@ -2791,7 +2800,8 @@ def test_drop_indexes(self) -> None: # test index corrupted mindex = pd.MultiIndex.from_tuples([([1, 2]), ([3, 4])], names=["a", "b"]) - ds = Dataset(coords={"x": mindex}) + mindex_coords = Coordinates.from_pandas_multiindex(mindex, "x") + ds = Dataset(coords=mindex_coords) with pytest.raises(ValueError, match=".*would corrupt the following index.*"): ds.drop_indexes("a") @@ -3077,8 +3087,12 @@ def test_rename_dimension_coord_warnings(self) -> None: def test_rename_multiindex(self) -> None: mindex = pd.MultiIndex.from_tuples([([1, 2]), ([3, 4])], names=["a", "b"]) - original = Dataset({}, {"x": mindex}) - expected = Dataset({}, {"x": mindex.rename(["a", "c"])}) + original_coords = Coordinates.from_pandas_multiindex(mindex, "x") + original = Dataset({}, original_coords) + expected_coords = Coordinates.from_pandas_multiindex( + mindex.rename(["a", "c"]), "x" + ) + expected = Dataset({}, expected_coords) actual = original.rename({"b": "c"}) assert_identical(expected, actual) @@ -3188,10 +3202,17 @@ def test_swap_dims(self) -> None: assert_identical(expected, actual) # handle multiindex case - idx = pd.MultiIndex.from_arrays([list("aab"), list("yzz")], names=["y1", "y2"]) - original = Dataset({"x": [1, 2, 3], "y": ("x", idx), "z": 42}) - expected = Dataset({"z": 42}, {"x": ("y", [1, 2, 3]), "y": idx}) - actual = original.swap_dims({"x": "y"}) + midx = pd.MultiIndex.from_arrays([list("aab"), list("yzz")], names=["y1", "y2"]) + midx_coords = xr.Coordinates.from_pandas_multiindex(midx, "y") + + original = Dataset({"x": [1, 2, 3], "y": ("x", midx), "z": 42}) + expected_coords = midx_coords.assign({"x": ("y", [1, 2, 3])}) + expected = Dataset({"z": 42}, expected_coords) + with pytest.warns( + FutureWarning, + match=".*`pandas.MultiIndex`.*no longer be implicitly promoted", + ): + actual = original.swap_dims({"x": "y"}) assert_identical(expected, actual) assert isinstance(actual.variables["y"], IndexVariable) assert isinstance(actual.variables["x"], Variable) @@ -3423,14 +3444,18 @@ def test_set_index_deindexed_coords(self) -> None: four = [3, 4, 3, 4] mindex_12 = pd.MultiIndex.from_arrays([one, two], names=["one", "two"]) + mindex_12_coords = Coordinates.from_pandas_multiindex(mindex_12, "x") mindex_34 = pd.MultiIndex.from_arrays([three, four], names=["three", "four"]) + mindex_34_coords = Coordinates.from_pandas_multiindex(mindex_34, "x") ds = xr.Dataset( - coords={"x": mindex_12, "three": ("x", three), "four": ("x", four)} + coords=mindex_12_coords.merge( + {"three": ("x", three), "four": ("x", four)} + ).coords ) actual = ds.set_index(x=["three", "four"]) expected = xr.Dataset( - coords={"x": mindex_34, "one": ("x", one), "two": ("x", two)} + coords=mindex_34_coords.merge({"one": ("x", one), "two": ("x", two)}).coords ) assert_identical(actual, expected) @@ -3482,7 +3507,8 @@ def test_reset_index_drop_convert( # check that multi-index dimension or level coordinates are dropped, converted # from IndexVariable to Variable or renamed to dimension as expected midx = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=("foo", "bar")) - ds = xr.Dataset(coords={"x": midx}) + midx_coords = Coordinates.from_pandas_multiindex(midx, "x") + ds = xr.Dataset(coords=midx_coords) reset = ds.reset_index(arg, drop=drop) for name in dropped: @@ -3496,7 +3522,8 @@ def test_reorder_levels(self) -> None: ds = create_test_multiindex() mindex = ds["x"].to_index() midx = mindex.reorder_levels(["level_2", "level_1"]) - expected = Dataset({}, coords={"x": midx}) + midx_coords = Coordinates.from_pandas_multiindex(midx, "x") + expected = Dataset({}, coords=midx_coords) # check attrs propagated ds["level_1"].attrs["foo"] = "bar" @@ -3564,7 +3591,7 @@ def test_stack(self) -> None: exp_index = pd.MultiIndex.from_product([[0, 1], ["a", "b"]], names=["x", "y"]) expected = Dataset( data_vars={"b": ("z", [0, 1, 2, 3])}, - coords={"z": exp_index}, + coords=Coordinates.from_pandas_multiindex(exp_index, "z"), ) # check attrs propagated ds["x"].attrs["foo"] = "bar" @@ -3588,7 +3615,7 @@ def test_stack(self) -> None: exp_index = pd.MultiIndex.from_product([["a", "b"], [0, 1]], names=["y", "x"]) expected = Dataset( data_vars={"b": ("z", [0, 2, 1, 3])}, - coords={"z": exp_index}, + coords=Coordinates.from_pandas_multiindex(exp_index, "z"), ) expected["x"].attrs["foo"] = "bar" @@ -3619,9 +3646,10 @@ def test_stack_create_index(self, create_index, expected_keys) -> None: def test_stack_multi_index(self) -> None: # multi-index on a dimension to stack is discarded too midx = pd.MultiIndex.from_product([["a", "b"], [0, 1]], names=("lvl1", "lvl2")) + midx_coords = Coordinates.from_pandas_multiindex(midx, "x") ds = xr.Dataset( data_vars={"b": (("x", "y"), [[0, 1], [2, 3], [4, 5], [6, 7]])}, - coords={"x": midx, "y": [0, 1]}, + coords=midx_coords.merge({"y": [0, 1]}).coords, ) expected = Dataset( data_vars={"b": ("z", [0, 1, 2, 3, 4, 5, 6, 7])}, @@ -3648,7 +3676,7 @@ def test_stack_non_dim_coords(self) -> None: exp_index = pd.MultiIndex.from_product([[0, 1], ["a", "b"]], names=["xx", "y"]) expected = Dataset( data_vars={"b": ("z", [0, 1, 2, 3])}, - coords={"z": exp_index}, + coords=Coordinates.from_pandas_multiindex(exp_index, "z"), ) actual = ds.stack(z=["x", "y"]) @@ -3657,7 +3685,10 @@ def test_stack_non_dim_coords(self) -> None: def test_unstack(self) -> None: index = pd.MultiIndex.from_product([[0, 1], ["a", "b"]], names=["x", "y"]) - ds = Dataset(data_vars={"b": ("z", [0, 1, 2, 3])}, coords={"z": index}) + ds = Dataset( + data_vars={"b": ("z", [0, 1, 2, 3])}, + coords=Coordinates.from_pandas_multiindex(index, "z"), + ) expected = Dataset( {"b": (("x", "y"), [[0, 1], [2, 3]]), "x": [0, 1], "y": ["a", "b"]} ) @@ -3722,9 +3753,12 @@ def test_unstack_sparse(self) -> None: mindex = pd.MultiIndex.from_arrays( [np.arange(3), np.arange(3)], names=["a", "b"] ) + mindex_coords = Coordinates.from_pandas_multiindex(mindex, "z") ds_eye = Dataset( {"var": (("z", "foo", "bar"), np.ones((3, 4, 5)))}, - coords={"z": mindex, "foo": np.arange(4), "bar": np.arange(5)}, + coords=mindex_coords.merge( + {"foo": np.arange(4), "bar": np.arange(5)} + ).coords, ) actual3 = ds_eye.unstack(sparse=True, fill_value=0) assert isinstance(actual3["var"].data, sparse_array_type) @@ -6218,10 +6252,12 @@ def test_sortby(self) -> None: # test pandas.MultiIndex indices = (("b", 1), ("b", 0), ("a", 1), ("a", 0)) midx = pd.MultiIndex.from_tuples(indices, names=["one", "two"]) + midx_coords = Coordinates.from_pandas_multiindex(midx, "x") ds_midx = Dataset( { "A": DataArray( - [[1, 2], [3, 4], [5, 6], [7, 8]], [("x", midx), ("y", [1, 0])] + [[1, 2], [3, 4], [5, 6], [7, 8]], + coords=midx_coords.merge({"y": [1, 0]}).coords, ), "B": DataArray([[5, 6], [7, 8], [9, 10], [11, 12]], dims=["x", "y"]), } @@ -6230,11 +6266,12 @@ def test_sortby(self) -> None: midx_reversed = pd.MultiIndex.from_tuples( tuple(reversed(indices)), names=["one", "two"] ) + midx_reversed_coords = Coordinates.from_pandas_multiindex(midx_reversed, "x") expected = Dataset( { "A": DataArray( [[7, 8], [5, 6], [3, 4], [1, 2]], - [("x", midx_reversed), ("y", [1, 0])], + coords=midx_reversed_coords.merge({"y": [1, 0]}).coords, ), "B": DataArray([[11, 12], [9, 10], [7, 8], [5, 6]], dims=["x", "y"]), } From 60bbbe3cbf342d881c4ce4c32aa978805809fc5d Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 1 Sep 2023 17:47:05 +0200 Subject: [PATCH 03/11] refactor broadcast (full multi-index support) --- xarray/core/alignment.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/xarray/core/alignment.py b/xarray/core/alignment.py index 39ff878b56d..eb0e5aadda3 100644 --- a/xarray/core/alignment.py +++ b/xarray/core/alignment.py @@ -937,14 +937,21 @@ def reindex_like( def _get_broadcast_dims_map_common_coords(args, exclude): - common_coords = {} + from xarray.core.coordinates import Coordinates + + common_coords = Coordinates() dims_map = {} for arg in args: for dim in arg.dims: if dim not in common_coords and dim not in exclude: dims_map[dim] = arg.sizes[dim] if dim in arg._indexes: - common_coords.update(arg.xindexes.get_all_coords(dim)) + idx = arg._indexes[dim] + idx_vars = arg.xindexes.get_all_coords(dim) + coords = Coordinates._construct_direct( + idx_vars, {k: idx for k in idx_vars} + ) + common_coords.update(coords) return dims_map, common_coords @@ -967,7 +974,7 @@ def _set_dims(var): def _broadcast_array(array: T_DataArray) -> T_DataArray: data = _set_dims(array.variable) - coords = dict(array.coords) + coords = array.coords.copy() coords.update(common_coords) return array.__class__( data, coords, data.dims, name=array.name, attrs=array.attrs @@ -975,7 +982,7 @@ def _broadcast_array(array: T_DataArray) -> T_DataArray: def _broadcast_dataset(ds: T_Dataset) -> T_Dataset: data_vars = {k: _set_dims(ds.variables[k]) for k in ds.data_vars} - coords = dict(ds.coords) + coords = ds.coords.copy() coords.update(common_coords) return ds.__class__(data_vars, coords, ds.attrs) From 00ecc27574d0326213b0b47e167ad18942f345dd Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Sun, 3 Sep 2023 12:22:35 +0200 Subject: [PATCH 04/11] refactor polyfit (full multi-index support) --- xarray/core/dataset.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index f1a0cb9dc34..6f28422589f 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -8758,13 +8758,20 @@ def polyfit( skipna_da = bool(np.any(da.isnull())) dims_to_stack = [dimname for dimname in da.dims if dimname != dim] - stacked_coords: dict[Hashable, DataArray] = {} + stacked_coords = Coordinates() if dims_to_stack: stacked_dim = utils.get_temp_dimname(dims_to_stack, "stacked") rhs = da.transpose(dim, *dims_to_stack).stack( {stacked_dim: dims_to_stack} ) - stacked_coords = {stacked_dim: rhs[stacked_dim]} + idx, idx_vars = rhs._to_temp_dataset()._get_stack_index( + stacked_dim, multi=True + ) + if idx is not None: + coords = Coordinates._construct_direct( + idx_vars, indexes={k: idx for k in idx_vars} + ) + stacked_coords.update(coords) scale_da = scale[:, np.newaxis] else: rhs = da @@ -8789,10 +8796,12 @@ def polyfit( # Thus a ReprObject => polyfit was called on a DataArray name = "" + coeffs_coords = stacked_coords.assign({degree_dim: np.arange(order)[::-1]}) + coeffs = DataArray( coeffs / scale_da, - dims=[degree_dim] + list(stacked_coords.keys()), - coords={degree_dim: np.arange(order)[::-1], **stacked_coords}, + dims=[degree_dim] + list(stacked_coords.dims), + coords=coeffs_coords, name=name + "polyfit_coefficients", ) if dims_to_stack: @@ -8802,7 +8811,7 @@ def polyfit( if full or (cov is True): residuals = DataArray( residuals if dims_to_stack else residuals.squeeze(), - dims=list(stacked_coords.keys()), + dims=stacked_coords.dims, coords=stacked_coords, name=name + "polyfit_residuals", ) From 36b8531109c73f94b416c179595b7e86e70ef989 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Sun, 3 Sep 2023 13:28:04 +0200 Subject: [PATCH 05/11] refactor concat --- xarray/core/concat.py | 22 ++++++++++++++-------- xarray/tests/test_concat.py | 12 +++++++----- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index d7aad8c7188..686aff699fa 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -8,6 +8,7 @@ from xarray.core import dtypes, utils from xarray.core.alignment import align, reindex_variables +from xarray.core.coordinates import Coordinates from xarray.core.duck_array_ops import lazy_array_equiv from xarray.core.indexes import Index, PandasIndex from xarray.core.merge import ( @@ -645,29 +646,34 @@ def get_indexes(name): # preserves original variable order result_vars[name] = result_vars.pop(name) - result = type(datasets[0])(result_vars, attrs=result_attrs) + result_coords = Coordinates( + coords={k: v for k, v in result_vars.items() if k in coord_names}, + indexes=result_indexes, + ) + result_data_vars = {k: v for k, v in result_vars.items() if k not in result_coords} + result = type(datasets[0])( + result_data_vars, coords=result_coords, attrs=result_attrs + ) absent_coord_names = coord_names - set(result.variables) if absent_coord_names: raise ValueError( f"Variables {absent_coord_names!r} are coordinates in some datasets but not others." ) - result = result.set_coords(coord_names) result.encoding = result_encoding result = result.drop_vars(unlabeled_dims, errors="ignore") if index is not None: - # add concat index / coordinate last to ensure that its in the final Dataset + # add concat index / coordinate last to ensure that it is in the final Dataset if dim_var is not None: index_vars = index.create_variables({dim: dim_var}) else: index_vars = index.create_variables() - result[dim] = index_vars[dim] - result_indexes[dim] = index - - # TODO: add indexes at Dataset creation (when it is supported) - result = result._overwrite_indexes(result_indexes) + index_coords = Coordinates._construct_direct( + coords=index_vars, indexes={k: index for k in index_vars} + ) + result = result.assign_coords(index_coords) return result diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 030f653e031..e3a0d0bee4a 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -7,7 +7,7 @@ import pandas as pd import pytest -from xarray import DataArray, Dataset, Variable, concat +from xarray import Coordinates, DataArray, Dataset, Variable, concat from xarray.core import dtypes, merge from xarray.core.indexes import PandasIndex from xarray.tests import ( @@ -906,8 +906,9 @@ def test_concat_dim_is_dataarray(self) -> None: assert_identical(actual, expected) def test_concat_multiindex(self) -> None: - x = pd.MultiIndex.from_product([[1, 2, 3], ["a", "b"]]) - expected = Dataset(coords={"x": x}) + midx = pd.MultiIndex.from_product([[1, 2, 3], ["a", "b"]]) + midx_coords = Coordinates.from_pandas_multiindex(midx, "x") + expected = Dataset(coords=midx_coords) actual = concat( [expected.isel(x=slice(2)), expected.isel(x=slice(2, None))], "x" ) @@ -917,8 +918,9 @@ def test_concat_multiindex(self) -> None: def test_concat_along_new_dim_multiindex(self) -> None: # see https://github.com/pydata/xarray/issues/6881 level_names = ["x_level_0", "x_level_1"] - x = pd.MultiIndex.from_product([[1, 2, 3], ["a", "b"]], names=level_names) - ds = Dataset(coords={"x": x}) + midx = pd.MultiIndex.from_product([[1, 2, 3], ["a", "b"]], names=level_names) + midx_coords = Coordinates.from_pandas_multiindex(midx, "x") + ds = Dataset(coords=midx_coords) concatenated = concat([ds], "new") actual = list(concatenated.xindexes.get_all_coords("x")) expected = ["x"] + level_names From 99b2adad15746f34b1d02ced582277627adf0557 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Sun, 3 Sep 2023 13:46:38 +0200 Subject: [PATCH 06/11] wip: 2nd pass silencing warnings --- xarray/tests/test_dask.py | 9 ++++++--- xarray/tests/test_indexing.py | 11 ++++++----- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index 6e65d52fdb5..f4b8528a7e4 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -11,7 +11,7 @@ import pytest import xarray as xr -from xarray import DataArray, Dataset, Variable +from xarray import Coordinates, DataArray, Dataset, Variable from xarray.core import duck_array_ops from xarray.core.duck_array_ops import lazy_array_equiv from xarray.testing import assert_chunks_equal @@ -639,8 +639,11 @@ def test_stack(self): data = da.random.normal(size=(2, 3, 4), chunks=(1, 3, 4)) arr = DataArray(data, dims=("w", "x", "y")) stacked = arr.stack(z=("x", "y")) - z = pd.MultiIndex.from_product([np.arange(3), np.arange(4)], names=["x", "y"]) - expected = DataArray(data.reshape(2, -1), {"z": z}, dims=["w", "z"]) + midx = pd.MultiIndex.from_product( + [np.arange(3), np.arange(4)], names=["x", "y"] + ) + midx_coords = Coordinates.from_pandas_multiindex(midx, "z") + expected = DataArray(data.reshape(2, -1), coords=midx_coords, dims=["w", "z"]) assert stacked.data.chunks == expected.data.chunks self.assertLazyAndEqual(expected, stacked) diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index 9f57b3b9056..8222d375a1a 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -7,7 +7,7 @@ import pandas as pd import pytest -from xarray import DataArray, Dataset, Variable +from xarray import Coordinates, DataArray, Dataset, Variable from xarray.core import indexing, nputils from xarray.core.indexes import PandasIndex, PandasMultiIndex from xarray.core.types import T_Xarray @@ -68,9 +68,9 @@ def test_stacked_multiindex_min_max(self) -> None: def test_group_indexers_by_index(self) -> None: mindex = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=("one", "two")) - data = DataArray( - np.zeros((4, 2, 2)), coords={"x": mindex, "y": [1, 2]}, dims=("x", "y", "z") - ) + coords = Coordinates.from_pandas_multiindex(mindex, "x") + coords["y"] = [1, 2] + data = DataArray(np.zeros((4, 2, 2)), coords=coords, dims=("x", "y", "z")) data.coords["y2"] = ("y", [2.0, 3.0]) grouped_indexers = indexing.group_indexers_by_index( @@ -146,7 +146,8 @@ def test_indexer( mindex = pd.MultiIndex.from_product( [["a", "b"], [1, 2], [-1, -2]], names=("one", "two", "three") ) - mdata = DataArray(range(8), [("x", mindex)]) + mindex_coords = Coordinates.from_pandas_multiindex(mindex, "x") + mdata = DataArray(range(8), coords=mindex_coords) test_indexer(data, 1, indexing.IndexSelResult({"x": 0})) test_indexer(data, np.int32(1), indexing.IndexSelResult({"x": 0})) From bf4a184a536bf8ce6aca650898df15f980128d87 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Sun, 3 Sep 2023 14:26:22 +0200 Subject: [PATCH 07/11] wip: 3rd pass silencing warnings --- xarray/tests/test_dataarray.py | 58 ++++++++++++++++++++++++---------- xarray/tests/test_units.py | 29 ++++++++++------- 2 files changed, 60 insertions(+), 27 deletions(-) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index b4efe4ab2a7..af3cbeafcb8 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -15,6 +15,7 @@ import xarray as xr from xarray import ( + Coordinates, DataArray, Dataset, IndexVariable, @@ -27,7 +28,6 @@ from xarray.convert import from_cdms2 from xarray.core import dtypes from xarray.core.common import full_like -from xarray.core.coordinates import Coordinates from xarray.core.indexes import Index, PandasIndex, filter_indexes_from_coords from xarray.core.types import QueryEngineOptions, QueryParserOptions from xarray.core.utils import is_scalar @@ -79,7 +79,8 @@ def setup(self): self.mindex = pd.MultiIndex.from_product( [["a", "b"], [1, 2]], names=("level_1", "level_2") ) - self.mda = DataArray([0, 1, 2, 3], coords={"x": self.mindex}, dims="x") + self.mindex_coords = Coordinates.from_pandas_multiindex(self.mindex, "x") + self.mda = DataArray([0, 1, 2, 3], coords=self.mindex_coords, dims="x") def test_repr(self) -> None: v = Variable(["time", "x"], [[1, 2, 3], [4, 5, 6]], {"foo": "bar"}) @@ -116,7 +117,8 @@ def test_repr_multiindex_long(self) -> None: [["a", "b", "c", "d"], [1, 2, 3, 4, 5, 6, 7, 8]], names=("level_1", "level_2"), ) - mda_long = DataArray(list(range(32)), coords={"x": mindex_long}, dims="x") + coords = Coordinates.from_pandas_multiindex(mindex_long, "x") + mda_long = DataArray(list(range(32)), coords=coords, dims="x") expected = dedent( """\ @@ -403,10 +405,17 @@ def test_constructor_invalid(self) -> None: with pytest.raises(ValueError, match=r"conflicting sizes for dim"): DataArray([1, 2], coords={"x": [0, 1], "y": ("x", [1])}, dims="x") - with pytest.raises(ValueError, match=r"conflicting MultiIndex"): - DataArray(np.random.rand(4, 4), [("x", self.mindex), ("y", self.mindex)]) - with pytest.raises(ValueError, match=r"conflicting MultiIndex"): - DataArray(np.random.rand(4, 4), [("x", self.mindex), ("level_1", range(4))]) + with pytest.warns( + FutureWarning, match=r".*MultiIndex.*no longer be implicitly promoted" + ): + with pytest.raises(ValueError, match=r"conflicting MultiIndex"): + DataArray( + np.random.rand(4, 4), [("x", self.mindex), ("y", self.mindex)] + ) + with pytest.raises(ValueError, match=r"conflicting MultiIndex"): + DataArray( + np.random.rand(4, 4), [("x", self.mindex), ("level_1", range(4))] + ) with pytest.raises(ValueError, match=r"matching the dimension size"): DataArray(data, coords={"x": 0}, dims=["x", "y"]) @@ -1314,7 +1323,8 @@ def test_selection_multiindex(self) -> None: mindex = pd.MultiIndex.from_product( [["a", "b"], [1, 2], [-1, -2]], names=("one", "two", "three") ) - mdata = DataArray(range(8), [("x", mindex)]) + mindex_coords = Coordinates.from_pandas_multiindex(mindex, "x") + mdata = DataArray(range(8), coords=mindex_coords) def test_sel( lab_indexer, pos_indexer, replaced_idx=False, renamed_dim=None @@ -1544,7 +1554,8 @@ def test_reset_coords(self) -> None: # non-dimension index coordinate midx = pd.MultiIndex.from_product([["a", "b"], [0, 1]], names=("lvl1", "lvl2")) - data = DataArray([1, 2, 3, 4], coords={"x": midx}, dims="x", name="foo") + mindex_coords = Coordinates.from_pandas_multiindex(midx, "x") + data = DataArray([1, 2, 3, 4], coords=mindex_coords, dims="x", name="foo") with pytest.raises(ValueError, match=r"cannot remove index"): data.reset_coords("lvl1") @@ -1901,9 +1912,13 @@ def test_swap_dims(self) -> None: # multiindex case idx = pd.MultiIndex.from_arrays([list("aab"), list("yzz")], names=["y1", "y2"]) + idx_coords = Coordinates.from_pandas_multiindex(idx, "y") array = DataArray(np.random.randn(3), {"y": ("x", idx)}, "x") - expected = DataArray(array.values, {"y": idx}, "y") - actual = array.swap_dims({"x": "y"}) + expected = DataArray(array.values, idx_coords, "y") + with pytest.warns( + FutureWarning, match=r".*MultiIndex.*no longer be implicitly promoted" + ): + actual = array.swap_dims({"x": "y"}) assert_identical(expected, actual) for dim_name in set().union(expected.xindexes.keys(), actual.xindexes.keys()): assert actual.xindexes[dim_name].equals(expected.xindexes[dim_name]) @@ -2152,7 +2167,8 @@ def test_reset_index_keep_attrs(self) -> None: def test_reorder_levels(self) -> None: midx = self.mindex.reorder_levels(["level_2", "level_1"]) - expected = DataArray(self.mda.values, coords={"x": midx}, dims="x") + midx_coords = Coordinates.from_pandas_multiindex(midx, "x") + expected = DataArray(self.mda.values, coords=midx_coords, dims="x") obj = self.mda.reorder_levels(x=["level_2", "level_1"]) assert_identical(obj, expected) @@ -2489,7 +2505,10 @@ def test_unstack_pandas_consistency(self) -> None: df = pd.DataFrame({"foo": range(3), "x": ["a", "b", "b"], "y": [0, 0, 1]}) s = df.set_index(["x", "y"])["foo"] expected = DataArray(s.unstack(), name="foo") - actual = DataArray(s, dims="z").unstack("z") + with pytest.warns( + FutureWarning, match=r".*MultiIndex.*no longer be implicitly promoted" + ): + actual = DataArray(s, dims="z").unstack("z") assert_identical(expected, actual) @pytest.mark.filterwarnings("error") @@ -2508,7 +2527,10 @@ def test_unstack_roundtrip_integer_array(self) -> None: def test_stack_nonunique_consistency(self, da) -> None: da = da.isel(time=0, drop=True) # 2D actual = da.stack(z=["a", "x"]) - expected = DataArray(da.to_pandas().stack(), dims="z") + with pytest.warns( + FutureWarning, match=r".*MultiIndex.*no longer be implicitly promoted" + ): + expected = DataArray(da.to_pandas().stack(), dims="z") assert_identical(expected, actual) def test_to_unstacked_dataset_raises_value_error(self) -> None: @@ -3293,8 +3315,10 @@ def test_to_dataframe_multiindex(self) -> None: arr_np = np.random.randn(4, 3) mindex = pd.MultiIndex.from_product([[1, 2], list("ab")], names=["A", "B"]) + coords = Coordinates.from_pandas_multiindex(mindex, "MI") + coords["C"] = [5, 6, 7] - arr = DataArray(arr_np, [("MI", mindex), ("C", [5, 6, 7])], name="foo") + arr = DataArray(arr_np, coords, name="foo") actual = arr.to_dataframe() assert_array_equal(actual["foo"].values, arr_np.flatten()) @@ -3308,8 +3332,10 @@ def test_to_dataframe_0length(self) -> None: arr_np = np.random.randn(4, 0) mindex = pd.MultiIndex.from_product([[1, 2], list("ab")], names=["A", "B"]) + coords = Coordinates.from_pandas_multiindex(mindex, "MI") + coords["C"] = [] - arr = DataArray(arr_np, [("MI", mindex), ("C", [])], name="foo") + arr = DataArray(arr_np, coords, name="foo") actual = arr.to_dataframe() assert len(actual) == 0 diff --git a/xarray/tests/test_units.py b/xarray/tests/test_units.py index addd7587544..f92554b572d 100644 --- a/xarray/tests/test_units.py +++ b/xarray/tests/test_units.py @@ -144,21 +144,28 @@ def strip_units(obj): strip_units(name): strip_units(value) for name, value in obj.data_vars.items() } - coords = { - strip_units(name): strip_units(value) for name, value in obj.coords.items() - } + coords = xr.Coordinates( + coords={ + strip_units(name): strip_units(value) + for name, value in obj.coords.items() + }, + indexes=dict(obj.xindexes), + ) new_obj = xr.Dataset(data_vars=data_vars, coords=coords) elif isinstance(obj, xr.DataArray): data = array_strip_units(obj.variable._data) - coords = { - strip_units(name): ( - (value.dims, array_strip_units(value.variable._data)) - if isinstance(value.data, Quantity) - else value # to preserve multiindexes - ) - for name, value in obj.coords.items() - } + coords = xr.Coordinates( + coords={ + strip_units(name): ( + (value.dims, array_strip_units(value.variable._data)) + if isinstance(value.data, Quantity) + else value # to preserve multiindexes + ) + for name, value in obj.coords.items() + }, + indexes=dict(obj.xindexes), + ) new_obj = xr.DataArray( name=strip_units(obj.name), data=data, coords=coords, dims=obj.dims From 514e1dd4876e8c358241f6f839538407a8c780c3 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Sun, 3 Sep 2023 15:01:28 +0200 Subject: [PATCH 08/11] 4th pass silencing warnings --- xarray/tests/test_formatting_html.py | 3 ++- xarray/tests/test_groupby.py | 15 ++++++++++----- xarray/tests/test_variable.py | 5 +++-- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/xarray/tests/test_formatting_html.py b/xarray/tests/test_formatting_html.py index 7ea5c19019b..618334f924c 100644 --- a/xarray/tests/test_formatting_html.py +++ b/xarray/tests/test_formatting_html.py @@ -24,7 +24,8 @@ def multiindex(): mindex = pd.MultiIndex.from_product( [["a", "b"], [1, 2]], names=("level_1", "level_2") ) - return xr.Dataset({}, {"x": mindex}) + mindex_coords = xr.Coordinates.from_pandas_multiindex(mindex, "x") + return xr.Dataset({}, mindex_coords) @pytest.fixture diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index 5d99eda1e88..679a5e1e71c 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -9,7 +9,7 @@ import pytest import xarray as xr -from xarray import DataArray, Dataset, Variable +from xarray import Coordinates, DataArray, Dataset, Variable from xarray.core.groupby import _consolidate_slices from xarray.tests import ( InaccessibleArray, @@ -1080,7 +1080,8 @@ def setup(self): self.mindex = pd.MultiIndex.from_product( [["a", "b"], [1, 2]], names=("level_1", "level_2") ) - self.mda = DataArray([0, 1, 2, 3], coords={"x": self.mindex}, dims="x") + self.mindex_coords = Coordinates.from_pandas_multiindex(self.mindex, "x") + self.mda = DataArray([0, 1, 2, 3], coords=self.mindex_coords, dims="x") self.da = self.dv.copy() self.da.coords["abc"] = ("y", np.array(["a"] * 9 + ["c"] + ["b"] * 10)) @@ -1095,14 +1096,16 @@ def test_stack_groupby_unsorted_coord(self): arr = xr.DataArray(data, dims=dims, coords={"y": y_vals}) actual1 = arr.stack(z=dims).groupby("z").first() midx1 = pd.MultiIndex.from_product([[0, 1], [2, 3]], names=dims) - expected1 = xr.DataArray(data_flat, dims=["z"], coords={"z": midx1}) + midx1_coords = Coordinates.from_pandas_multiindex(midx1, "z") + expected1 = xr.DataArray(data_flat, dims=["z"], coords=midx1_coords) assert_equal(actual1, expected1) # GH: 3287. Note that y coord values are not in sorted order. arr = xr.DataArray(data, dims=dims, coords={"y": y_vals[::-1]}) actual2 = arr.stack(z=dims).groupby("z").first() midx2 = pd.MultiIndex.from_product([[0, 1], [3, 2]], names=dims) - expected2 = xr.DataArray(data_flat, dims=["z"], coords={"z": midx2}) + midx2_coords = Coordinates.from_pandas_multiindex(midx2, "z") + expected2 = xr.DataArray(data_flat, dims=["z"], coords=midx2_coords) assert_equal(actual2, expected2) def test_groupby_iter(self): @@ -2356,7 +2359,9 @@ def test_groupby_binary_op_regression() -> None: def test_groupby_multiindex_level() -> None: # GH6836 midx = pd.MultiIndex.from_product([list("abc"), [0, 1]], names=("one", "two")) - mda = xr.DataArray(np.random.rand(6, 3), [("x", midx), ("y", range(3))]) + coords = Coordinates.from_pandas_multiindex(midx, "x") + coords["y"] = range(3) + mda = xr.DataArray(np.random.rand(6, 3), coords=coords) groups = mda.groupby("one").groups assert groups == {"a": [0, 1], "b": [2, 3], "c": [4, 5]} diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index f30cdcf3f73..d0ec59413e5 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -11,7 +11,7 @@ import pytest import pytz -from xarray import DataArray, Dataset, IndexVariable, Variable, set_options +from xarray import Coordinates, DataArray, Dataset, IndexVariable, Variable, set_options from xarray.core import dtypes, duck_array_ops, indexing from xarray.core.common import full_like, ones_like, zeros_like from xarray.core.indexing import ( @@ -2376,7 +2376,8 @@ def test_to_index(self): def test_to_index_multiindex_level(self): midx = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=("one", "two")) - ds = Dataset(coords={"x": midx}) + midx_coords = Coordinates.from_pandas_multiindex(midx, "x") + ds = Dataset(coords=midx_coords) assert ds.one.variable.to_index().equals(midx.get_level_values("one")) def test_multiindex_default_level_names(self): From 3786f5507d34e43d378e9685844b3796f67c56da Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Sun, 3 Sep 2023 15:31:35 +0200 Subject: [PATCH 09/11] remove temp commented raise --- xarray/core/indexes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py index 6287c2f508c..7995289fc3c 100644 --- a/xarray/core/indexes.py +++ b/xarray/core/indexes.py @@ -1355,7 +1355,6 @@ def create_default_index_implicit( if isinstance(array, pd.MultiIndex): if warn_multi_index: - # raise ValueError("no pd.MultiIndex please!") emit_user_level_warning( f"the `pandas.MultiIndex` object wrapped in variable {name} will no longer " "be implicitly promoted into multiple indexed coordinates in the future. " From 72ad345505419232bed9a98069c1b4c71b3fee44 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Thu, 7 Sep 2023 10:23:18 +0200 Subject: [PATCH 10/11] improve warning message Add suggestions for the cases where the pandas multi-index is passed via a pandas dataframe or series. --- xarray/core/indexes.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py index 7995289fc3c..5a57d9ebca6 100644 --- a/xarray/core/indexes.py +++ b/xarray/core/indexes.py @@ -1356,10 +1356,13 @@ def create_default_index_implicit( if isinstance(array, pd.MultiIndex): if warn_multi_index: emit_user_level_warning( - f"the `pandas.MultiIndex` object wrapped in variable {name} will no longer " + f"the `pandas.MultiIndex` object wrapped in variable {name!r} will no longer " "be implicitly promoted into multiple indexed coordinates in the future. " - "If you want to pass it as coordinates, you need to first wrap it explicitly " - "using `xarray.Coordinates.from_pandas_multiindex()`.", + "If you want to keep this behavior, you need to first wrap it explicitly " + "using `xarray.Coordinates.from_pandas_multiindex()`. " + "If the multi-index was passed via a `pandas.DataFrame` or `pandas.Series` " + "object, please use `xarray.Dataset.from_dataframe` or " + "`xarray.DataArray.from_series` instead.", FutureWarning, ) index = PandasMultiIndex(array, name) From ef7dae0893f6701a203f8ec3c2e655bff7944b91 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Thu, 7 Sep 2023 10:28:05 +0200 Subject: [PATCH 11/11] update what's new --- doc/whats-new.rst | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 157795f08d1..19cced407f2 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -44,7 +44,7 @@ Deprecations :py:class:`Dataset` and :py:class:`DataArray` constructors as well as to :py:meth:`Dataset.assign` and :py:meth:`Dataset.assign_coords`. A new Xarray :py:class:`Coordinates` object has to be created first using - :py:meth:`Coordinates.from_pandas_multiindex` (:pull:`8094`). + :py:meth:`Coordinates.from_pandas_multiindex` (:pull:`8094`, :pull:`8140`). By `Benoît Bovy `_. Bug fixes @@ -57,6 +57,10 @@ Bug fixes names were not updated properly internally (:issue:`7405`, :issue:`7588`, :pull:`8104`). By `Benoît Bovy `_. +- Improved support of multi-coordinate indexes for a few functions and methods: + :py:func:`broadcast`, :py:meth:`Dataset.concat`, :py:meth:`Dataset.polyfit` + (:pull:`8140`). + By `Benoît Bovy `_. Documentation ~~~~~~~~~~~~~