From 1fedfd86604f87538d1953b01d6990c2c89fcbf3 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Tue, 29 Aug 2023 16:23:28 +0200 Subject: [PATCH] Refactor update coordinates to better handle multi-coordinate indexes (#8094) * generic warning implicitly wrap a pd.MultiIndex * refactor update_coords (assign) Fix more cases with multi-coordinate indexes: - do not try to align existing indexed coordinates with the new coordinates that will fully replace them - raise early if the new coordinates would corrupt the existing indexed coordinates - isolate PandasMultiIndex special cases so that it will be easier to drop support for it later (and warn now about deprecation) * fix alignment of updated coordinates when DataArray objects are passed as new coordinate objects * refactor Dataset.assign Need to update (replace) coordinates and data variables separately to ensure it goes through all (indexed) coordinate update checks. * fix and update tests * nit * fix mypy? * update what's new * fix import error * fix performance regression * nit * use warning util func and improve messages --- doc/whats-new.rst | 11 +- xarray/core/coordinates.py | 193 ++++++++++++++++++++------------- xarray/core/dataset.py | 22 +++- xarray/tests/test_dataarray.py | 10 +- xarray/tests/test_dataset.py | 71 +++++++++--- 5 files changed, 205 insertions(+), 102 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 4cd24a54fc8..3eacbce2895 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -14,7 +14,6 @@ What's New np.random.seed(123456) - .. _whats-new.2023.08.1: v2023.08.1 (unreleased) @@ -31,10 +30,19 @@ Breaking changes Deprecations ~~~~~~~~~~~~ +- Deprecate passing a :py:class:`pandas.MultiIndex` object directly to the + :py:class:`Dataset` and :py:class:`DataArray` constructors as well as to + :py:meth:`Dataset.assign` and :py:meth:`Dataset.assign_coords`. + A new Xarray :py:class:`Coordinates` object has to be created first using + :py:meth:`Coordinates.from_pandas_multiindex` (:pull:`8094`). + By `Benoît Bovy `_. Bug fixes ~~~~~~~~~ +- Improved handling of multi-coordinate indexes when updating coordinates, including bug fixes + (and improved warnings for deprecated features) for pandas multi-indexes (:pull:`8094`). + By `Benoît Bovy `_. Documentation ~~~~~~~~~~~~~ @@ -120,7 +128,6 @@ Breaking changes numbagg 0.1 0.2.1 ===================== ========= ======== - Documentation ~~~~~~~~~~~~~ diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index f03d98f781a..7c14a8c3d0a 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -1,6 +1,5 @@ from __future__ import annotations -import warnings from collections.abc import Hashable, Iterator, Mapping, Sequence from contextlib import contextmanager from typing import ( @@ -24,7 +23,7 @@ ) from xarray.core.merge import merge_coordinates_without_align, merge_coords from xarray.core.types import Self, T_DataArray -from xarray.core.utils import Frozen, ReprObject +from xarray.core.utils import Frozen, ReprObject, emit_user_level_warning from xarray.core.variable import Variable, as_variable, calculate_dimensions if TYPE_CHECKING: @@ -83,7 +82,7 @@ def variables(self): def _update_coords(self, coords, indexes): raise NotImplementedError() - def _maybe_drop_multiindex_coords(self, coords): + def _drop_coords(self, coord_names): raise NotImplementedError() def __iter__(self) -> Iterator[Hashable]: @@ -379,9 +378,9 @@ def _update_coords( # redirect to DatasetCoordinates._update_coords self._data.coords._update_coords(coords, indexes) - def _maybe_drop_multiindex_coords(self, coords: set[Hashable]) -> None: - # redirect to DatasetCoordinates._maybe_drop_multiindex_coords - self._data.coords._maybe_drop_multiindex_coords(coords) + def _drop_coords(self, coord_names): + # redirect to DatasetCoordinates._drop_coords + self._data.coords._drop_coords(coord_names) def _merge_raw(self, other, reflexive): """For use with binary arithmetic.""" @@ -454,22 +453,40 @@ def __setitem__(self, key: Hashable, value: Any) -> None: def update(self, other: Mapping[Any, Any]) -> None: """Update this Coordinates variables with other coordinate variables.""" - other_obj: Coordinates | Mapping[Hashable, Variable] + + if not len(other): + return + + other_coords: Coordinates if isinstance(other, Coordinates): - # special case: default indexes won't be created - other_obj = other + # Coordinates object: just pass it (default indexes won't be created) + other_coords = other else: - other_obj = getattr(other, "variables", other) + other_coords = create_coords_with_default_indexes( + getattr(other, "variables", other) + ) - self._maybe_drop_multiindex_coords(set(other_obj)) + # Discard original indexed coordinates prior to merge allows to: + # - fail early if the new coordinates don't preserve the integrity of existing + # multi-coordinate indexes + # - drop & replace coordinates without alignment (note: we must keep indexed + # coordinates extracted from the DataArray objects passed as values to + # `other` - if any - as those are still used for aligning the old/new coordinates) + coords_to_align = drop_indexed_coords(set(other_coords) & set(other), self) coords, indexes = merge_coords( - [self.variables, other_obj], + [coords_to_align, other_coords], priority_arg=1, - indexes=self.xindexes, + indexes=coords_to_align.xindexes, ) + # special case for PandasMultiIndex: updating only its dimension coordinate + # is still allowed but depreciated. + # It is the only case where we need to actually drop coordinates here (multi-index levels) + # TODO: remove when removing PandasMultiIndex's dimension coordinate. + self._drop_coords(self._names - coords_to_align._names) + self._update_coords(coords, indexes) def _overwrite_indexes( @@ -610,15 +627,20 @@ def _update_coords( original_indexes.update(indexes) self._data._indexes = original_indexes - def _maybe_drop_multiindex_coords(self, coords: set[Hashable]) -> None: - """Drops variables in coords, and any associated variables as well.""" + def _drop_coords(self, coord_names): + # should drop indexed coordinates only + for name in coord_names: + del self._data._variables[name] + del self._data._indexes[name] + self._data._coord_names.difference_update(coord_names) + + def _drop_indexed_coords(self, coords_to_drop: set[Hashable]) -> None: assert self._data.xindexes is not None - variables, indexes = drop_coords( - coords, self._data._variables, self._data.xindexes - ) - self._data._coord_names.intersection_update(variables) - self._data._variables = variables - self._data._indexes = indexes + new_coords = drop_indexed_coords(coords_to_drop, self) + for name in self._data._coord_names - new_coords._names: + del self._data._variables[name] + self._data._indexes = dict(new_coords.xindexes) + self._data._coord_names.intersection_update(new_coords._names) def __delitem__(self, key: Hashable) -> None: if key in self: @@ -691,13 +713,11 @@ def _update_coords( original_indexes.update(indexes) self._data._indexes = original_indexes - def _maybe_drop_multiindex_coords(self, coords: set[Hashable]) -> None: - """Drops variables in coords, and any associated variables as well.""" - variables, indexes = drop_coords( - coords, self._data._coords, self._data.xindexes - ) - self._data._coords = variables - self._data._indexes = indexes + def _drop_coords(self, coord_names): + # should drop indexed coordinates only + for name in coord_names: + del self._data._coords[name] + del self._data._indexes[name] @property def variables(self): @@ -724,35 +744,48 @@ def _ipython_key_completions_(self): return self._data._ipython_key_completions_() -def drop_coords( - coords_to_drop: set[Hashable], variables, indexes: Indexes -) -> tuple[dict, dict]: - """Drop index variables associated with variables in coords_to_drop.""" - # Only warn when we're dropping the dimension with the multi-indexed coordinate - # If asked to drop a subset of the levels in a multi-index, we raise an error - # later but skip the warning here. - new_variables = dict(variables.copy()) - new_indexes = dict(indexes.copy()) - for key in coords_to_drop & set(indexes): - maybe_midx = indexes[key] - idx_coord_names = set(indexes.get_all_coords(key)) - if ( - isinstance(maybe_midx, PandasMultiIndex) - and key == maybe_midx.dim - and (idx_coord_names - coords_to_drop) - ): - warnings.warn( - f"Updating MultiIndexed coordinate {key!r} would corrupt indices for " - f"other variables: {list(maybe_midx.index.names)!r}. " - f"This will raise an error in the future. Use `.drop_vars({idx_coord_names!r})` before " +def drop_indexed_coords( + coords_to_drop: set[Hashable], coords: Coordinates +) -> Coordinates: + """Drop indexed coordinates associated with coordinates in coords_to_drop. + + This will raise an error in case it corrupts any passed index and its + coordinate variables. + + """ + new_variables = dict(coords.variables) + new_indexes = dict(coords.xindexes) + + for idx, idx_coords in coords.xindexes.group_by_index(): + idx_drop_coords = set(idx_coords) & coords_to_drop + + # special case for pandas multi-index: still allow but deprecate + # dropping only its dimension coordinate. + # TODO: remove when removing PandasMultiIndex's dimension coordinate. + if isinstance(idx, PandasMultiIndex) and idx_drop_coords == {idx.dim}: + idx_drop_coords.update(idx.index.names) + emit_user_level_warning( + f"updating coordinate {idx.dim!r} with a PandasMultiIndex would leave " + f"the multi-index level coordinates {list(idx.index.names)!r} in an inconsistent state. " + f"This will raise an error in the future. Use `.drop_vars({list(idx_coords)!r})` before " "assigning new coordinate values.", FutureWarning, - stacklevel=4, ) - for k in idx_coord_names: - del new_variables[k] - del new_indexes[k] - return new_variables, new_indexes + + elif idx_drop_coords and len(idx_drop_coords) != len(idx_coords): + idx_drop_coords_str = ", ".join(f"{k!r}" for k in idx_drop_coords) + idx_coords_str = ", ".join(f"{k!r}" for k in idx_coords) + raise ValueError( + f"cannot drop or update coordinate(s) {idx_drop_coords_str}, which would corrupt " + f"the following index built from coordinates {idx_coords_str}:\n" + f"{idx}" + ) + + for k in idx_drop_coords: + del new_variables[k] + del new_indexes[k] + + return Coordinates._construct_direct(coords=new_variables, indexes=new_indexes) def assert_coordinate_consistent( @@ -773,11 +806,15 @@ def assert_coordinate_consistent( def create_coords_with_default_indexes( - coords: Mapping[Any, Any], data_vars: Mapping[Any, Variable] | None = None + coords: Mapping[Any, Any], data_vars: Mapping[Any, Any] | None = None ) -> Coordinates: - """Maybe create default indexes from a mapping of coordinates.""" + """Returns a Coordinates object from a mapping of coordinates (arbitrary objects). + + Create default (pandas) indexes for each of the input dimension coordinates. + Extract coordinates from each input DataArray. - # Note: data_vars are needed here only because a pd.MultiIndex object + """ + # Note: data_vars is needed here only because a pd.MultiIndex object # can be promoted as coordinates. # TODO: It won't be relevant anymore when this behavior will be dropped # in favor of the more explicit ``Coordinates.from_pandas_multiindex()``. @@ -791,34 +828,34 @@ def create_coords_with_default_indexes( indexes: dict[Hashable, Index] = {} variables: dict[Hashable, Variable] = {} - maybe_index_vars: dict[Hashable, Variable] = {} - mindex_data_vars: list[Hashable] = [] + # promote any pandas multi-index in data_vars as coordinates + coords_promoted: dict[Hashable, Any] = {} + pd_mindex_keys: list[Hashable] = [] for k, v in all_variables.items(): - if k in coords: - maybe_index_vars[k] = v - elif isinstance(v, pd.MultiIndex): - # TODO: eventually stop promoting multi-index passed via data variables - mindex_data_vars.append(k) - maybe_index_vars[k] = v - - if mindex_data_vars: - warnings.warn( - f"passing one or more `pandas.MultiIndex` via data variable(s) {mindex_data_vars} " - "will no longer create indexed coordinates in the future. " - "If you want to keep this behavior, pass it as coordinates instead.", + if isinstance(v, pd.MultiIndex): + coords_promoted[k] = v + pd_mindex_keys.append(k) + elif k in coords: + coords_promoted[k] = v + + if pd_mindex_keys: + pd_mindex_keys_fmt = ",".join([f"'{k}'" for k in pd_mindex_keys]) + emit_user_level_warning( + f"the `pandas.MultiIndex` object(s) passed as {pd_mindex_keys_fmt} coordinate(s) or " + "data variable(s) will no longer be implicitly promoted and wrapped into " + "multiple indexed coordinates in the future " + "(i.e., one coordinate for each multi-index level + one dimension coordinate). " + "If you want to keep this behavior, you need to first wrap it explicitly using " + "`mindex_coords = xarray.Coordinates.from_pandas_multiindex(mindex_obj, 'dim')` " + "and pass it as coordinates, e.g., `xarray.Dataset(coords=mindex_coords)`, " + "`dataset.assign_coords(mindex_coords)` or `dataarray.assign_coords(mindex_coords)`.", FutureWarning, ) - maybe_index_vars = { - k: v - for k, v in all_variables.items() - if k in coords or isinstance(v, pd.MultiIndex) - } - dataarray_coords: list[DataArrayCoordinates] = [] - for name, obj in maybe_index_vars.items(): + for name, obj in coords_promoted.items(): if isinstance(obj, DataArray): dataarray_coords.append(obj.coords) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index bdf2d8babe1..9a53f24c67c 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -120,7 +120,7 @@ from xarray.backends.api import T_NetcdfEngine, T_NetcdfTypes from xarray.core.dataarray import DataArray from xarray.core.groupby import DatasetGroupBy - from xarray.core.merge import CoercibleMapping + from xarray.core.merge import CoercibleMapping, CoercibleValue from xarray.core.parallelcompat import ChunkManagerEntrypoint from xarray.core.resample import DatasetResample from xarray.core.rolling import DatasetCoarsen, DatasetRolling @@ -6879,6 +6879,10 @@ def assign( possible, but you cannot reference other variables created within the same ``assign`` call. + The new assigned variables that replace existing coordinates in the + original dataset are still listed as coordinates in the returned + Dataset. + See Also -------- pandas.DataFrame.assign @@ -6934,11 +6938,23 @@ def assign( """ variables = either_dict_or_kwargs(variables, variables_kwargs, "assign") data = self.copy() + # do all calculations first... results: CoercibleMapping = data._calc_assign_results(variables) - data.coords._maybe_drop_multiindex_coords(set(results.keys())) + + # split data variables to add/replace vs. coordinates to replace + results_data_vars: dict[Hashable, CoercibleValue] = {} + results_coords: dict[Hashable, CoercibleValue] = {} + for k, v in results.items(): + if k in data._coord_names: + results_coords[k] = v + else: + results_data_vars[k] = v + # ... and then assign - data.update(results) + data.coords.update(results_coords) + data.update(results_data_vars) + return data def to_array( diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 183c0ad7371..f615ae70b6b 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -1435,7 +1435,7 @@ def test_coords(self) -> None: assert_identical(da, expected) with pytest.raises( - ValueError, match=r"cannot set or update variable.*corrupt.*index " + ValueError, match=r"cannot drop or update coordinate.*corrupt.*index " ): self.mda["level_1"] = ("x", np.arange(4)) self.mda.coords["level_1"] = ("x", np.arange(4)) @@ -1555,7 +1555,7 @@ def test_assign_coords(self) -> None: assert_identical(actual, expected) with pytest.raises( - ValueError, match=r"cannot set or update variable.*corrupt.*index " + ValueError, match=r"cannot drop or update coordinate.*corrupt.*index " ): self.mda.assign_coords(level_1=("x", range(4))) @@ -1570,7 +1570,9 @@ def test_assign_coords(self) -> None: def test_assign_coords_existing_multiindex(self) -> None: data = self.mda - with pytest.warns(FutureWarning, match=r"Updating MultiIndexed coordinate"): + with pytest.warns( + FutureWarning, match=r"updating coordinate.*MultiIndex.*inconsistent" + ): data.assign_coords(x=range(4)) def test_assign_coords_custom_index(self) -> None: @@ -1608,7 +1610,7 @@ def test_set_coords_update_index(self) -> None: def test_set_coords_multiindex_level(self) -> None: with pytest.raises( - ValueError, match=r"cannot set or update variable.*corrupt.*index " + ValueError, match=r"cannot drop or update coordinate.*corrupt.*index " ): self.mda["level_1"] = range(4) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 5304c54971a..3110c4e2882 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -199,7 +199,7 @@ def create_test_multiindex() -> Dataset: mindex = pd.MultiIndex.from_product( [["a", "b"], [1, 2]], names=("level_1", "level_2") ) - return Dataset({}, {"x": mindex}) + return Dataset({}, Coordinates.from_pandas_multiindex(mindex, "x")) def create_test_stacked_array() -> tuple[DataArray, DataArray]: @@ -648,10 +648,17 @@ def test_constructor_multiindex(self) -> None: assert_identical(ds, coords.to_dataset()) with pytest.warns( - FutureWarning, match=".*`pandas.MultiIndex` via data variable.*" + FutureWarning, + match=".*`pandas.MultiIndex`.*no longer be implicitly promoted.*", ): Dataset(data_vars={"x": midx}) + with pytest.warns( + FutureWarning, + match=".*`pandas.MultiIndex`.*no longer be implicitly promoted.*", + ): + Dataset(coords={"x": midx}) + def test_constructor_custom_index(self) -> None: class CustomIndex(Index): ... @@ -872,7 +879,7 @@ def test_coords_modify(self) -> None: assert_array_equal(actual["z"], ["a", "b"]) actual = data.copy(deep=True) - with pytest.raises(ValueError, match=r"conflicting sizes"): + with pytest.raises(ValueError, match=r"conflicting dimension sizes"): actual.coords["x"] = ("x", [-1]) assert_identical(actual, data) # should not be modified @@ -909,9 +916,7 @@ def test_coords_setitem_with_new_dimension(self) -> None: def test_coords_setitem_multiindex(self) -> None: data = create_test_multiindex() - with pytest.raises( - ValueError, match=r"cannot set or update variable.*corrupt.*index " - ): + with pytest.raises(ValueError, match=r"cannot drop or update.*corrupt.*index "): data.coords["level_1"] = range(4) def test_coords_set(self) -> None: @@ -4244,22 +4249,58 @@ def test_assign_attrs(self) -> None: def test_assign_multiindex_level(self) -> None: data = create_test_multiindex() - with pytest.raises( - ValueError, match=r"cannot set or update variable.*corrupt.*index " - ): + with pytest.raises(ValueError, match=r"cannot drop or update.*corrupt.*index "): data.assign(level_1=range(4)) data.assign_coords(level_1=range(4)) + def test_assign_new_multiindex(self) -> None: + midx = pd.MultiIndex.from_arrays([["a", "a", "b", "b"], [0, 1, 0, 1]]) + midx_coords = Coordinates.from_pandas_multiindex(midx, "x") + + ds = Dataset(coords={"x": [1, 2]}) + expected = Dataset(coords=midx_coords) + + with pytest.warns( + FutureWarning, + match=".*`pandas.MultiIndex`.*no longer be implicitly promoted.*", + ): + actual = ds.assign(x=midx) + assert_identical(actual, expected) + + @pytest.mark.parametrize("orig_coords", [{}, {"x": range(4)}]) + def test_assign_coords_new_multiindex(self, orig_coords) -> None: + ds = Dataset(coords=orig_coords) + midx = pd.MultiIndex.from_arrays( + [["a", "a", "b", "b"], [0, 1, 0, 1]], names=("one", "two") + ) + midx_coords = Coordinates.from_pandas_multiindex(midx, "x") + + expected = Dataset(coords=midx_coords) + + with pytest.warns( + FutureWarning, + match=".*`pandas.MultiIndex`.*no longer be implicitly promoted.*", + ): + actual = ds.assign_coords({"x": midx}) + assert_identical(actual, expected) + + actual = ds.assign_coords(midx_coords) + assert_identical(actual, expected) + def test_assign_coords_existing_multiindex(self) -> None: data = create_test_multiindex() - with pytest.warns(FutureWarning, match=r"Updating MultiIndexed coordinate"): - data.assign_coords(x=range(4)) - - with pytest.warns(FutureWarning, match=r"Updating MultiIndexed coordinate"): - data.assign(x=range(4)) + with pytest.warns( + FutureWarning, match=r"updating coordinate.*MultiIndex.*inconsistent" + ): + updated = data.assign_coords(x=range(4)) + # https://github.com/pydata/xarray/issues/7097 (coord names updated) + assert len(updated.coords) == 1 + with pytest.warns( + FutureWarning, match=r"updating coordinate.*MultiIndex.*inconsistent" + ): + updated = data.assign(x=range(4)) # https://github.com/pydata/xarray/issues/7097 (coord names updated) - updated = data.assign_coords(x=range(4)) assert len(updated.coords) == 1 def test_assign_all_multiindex_coords(self) -> None: