Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Deprecate passing pd.MultiIndex implicitly #8140

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
6 changes: 5 additions & 1 deletion doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ Deprecations
:py:class:`Dataset` and :py:class:`DataArray` constructors as well as to
:py:meth:`Dataset.assign` and :py:meth:`Dataset.assign_coords`.
A new Xarray :py:class:`Coordinates` object has to be created first using
:py:meth:`Coordinates.from_pandas_multiindex` (:pull:`8094`).
:py:meth:`Coordinates.from_pandas_multiindex` (:pull:`8094`, :pull:`8140`).
By `Benoît Bovy <https://github.com/benbovy>`_.

Bug fixes
Expand All @@ -57,6 +57,10 @@ Bug fixes
names were not updated properly internally (:issue:`7405`, :issue:`7588`,
:pull:`8104`).
By `Benoît Bovy <https://github.com/benbovy>`_.
- Improved support for multi-coordinate indexes in a few functions and methods:
:py:func:`broadcast`, :py:func:`concat`, :py:meth:`Dataset.polyfit`
(:pull:`8140`).
By `Benoît Bovy <https://github.com/benbovy>`_.

Documentation
~~~~~~~~~~~~~
Expand Down
15 changes: 11 additions & 4 deletions xarray/core/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -937,14 +937,21 @@ def reindex_like(


def _get_broadcast_dims_map_common_coords(args, exclude):
common_coords = {}
from xarray.core.coordinates import Coordinates

common_coords = Coordinates()
dims_map = {}
for arg in args:
for dim in arg.dims:
if dim not in common_coords and dim not in exclude:
dims_map[dim] = arg.sizes[dim]
if dim in arg._indexes:
common_coords.update(arg.xindexes.get_all_coords(dim))
idx = arg._indexes[dim]
idx_vars = arg.xindexes.get_all_coords(dim)
coords = Coordinates._construct_direct(
idx_vars, {k: idx for k in idx_vars}
)
common_coords.update(coords)

return dims_map, common_coords

Expand All @@ -967,15 +974,15 @@ def _set_dims(var):

def _broadcast_array(array: T_DataArray) -> T_DataArray:
data = _set_dims(array.variable)
coords = dict(array.coords)
coords = array.coords.copy()
coords.update(common_coords)
return array.__class__(
data, coords, data.dims, name=array.name, attrs=array.attrs
)

def _broadcast_dataset(ds: T_Dataset) -> T_Dataset:
data_vars = {k: _set_dims(ds.variables[k]) for k in ds.data_vars}
coords = dict(ds.coords)
coords = ds.coords.copy()
coords.update(common_coords)
return ds.__class__(data_vars, coords, ds.attrs)

Expand Down
22 changes: 14 additions & 8 deletions xarray/core/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from xarray.core import dtypes, utils
from xarray.core.alignment import align, reindex_variables
from xarray.core.coordinates import Coordinates
from xarray.core.duck_array_ops import lazy_array_equiv
from xarray.core.indexes import Index, PandasIndex
from xarray.core.merge import (
Expand Down Expand Up @@ -645,29 +646,34 @@ def get_indexes(name):
# preserves original variable order
result_vars[name] = result_vars.pop(name)

result = type(datasets[0])(result_vars, attrs=result_attrs)
result_coords = Coordinates(
coords={k: v for k, v in result_vars.items() if k in coord_names},
indexes=result_indexes,
)
result_data_vars = {k: v for k, v in result_vars.items() if k not in result_coords}
result = type(datasets[0])(
result_data_vars, coords=result_coords, attrs=result_attrs
)

absent_coord_names = coord_names - set(result.variables)
if absent_coord_names:
raise ValueError(
f"Variables {absent_coord_names!r} are coordinates in some datasets but not others."
)
result = result.set_coords(coord_names)
result.encoding = result_encoding

result = result.drop_vars(unlabeled_dims, errors="ignore")

if index is not None:
# add concat index / coordinate last to ensure that its in the final Dataset
# add concat index / coordinate last to ensure that it is in the final Dataset
if dim_var is not None:
index_vars = index.create_variables({dim: dim_var})
else:
index_vars = index.create_variables()
result[dim] = index_vars[dim]
result_indexes[dim] = index

# TODO: add indexes at Dataset creation (when it is supported)
result = result._overwrite_indexes(result_indexes)
index_coords = Coordinates._construct_direct(
coords=index_vars, indexes={k: index for k in index_vars}
)
result = result.assign_coords(index_coords)

return result

Expand Down
30 changes: 5 additions & 25 deletions xarray/core/coordinates.py
Original file line number Diff line number Diff line change
Expand Up @@ -951,31 +951,11 @@ def create_coords_with_default_indexes(
indexes: dict[Hashable, Index] = {}
variables: dict[Hashable, Variable] = {}

# promote any pandas multi-index in data_vars as coordinates
coords_promoted: dict[Hashable, Any] = {}
pd_mindex_keys: list[Hashable] = []

for k, v in all_variables.items():
if isinstance(v, pd.MultiIndex):
coords_promoted[k] = v
pd_mindex_keys.append(k)
elif k in coords:
coords_promoted[k] = v

if pd_mindex_keys:
pd_mindex_keys_fmt = ",".join([f"'{k}'" for k in pd_mindex_keys])
emit_user_level_warning(
f"the `pandas.MultiIndex` object(s) passed as {pd_mindex_keys_fmt} coordinate(s) or "
"data variable(s) will no longer be implicitly promoted and wrapped into "
"multiple indexed coordinates in the future "
"(i.e., one coordinate for each multi-index level + one dimension coordinate). "
"If you want to keep this behavior, you need to first wrap it explicitly using "
"`mindex_coords = xarray.Coordinates.from_pandas_multiindex(mindex_obj, 'dim')` "
"and pass it as coordinates, e.g., `xarray.Dataset(coords=mindex_coords)`, "
"`dataset.assign_coords(mindex_coords)` or `dataarray.assign_coords(mindex_coords)`.",
FutureWarning,
)

coords_promoted: dict[Hashable, Any] = {
k: v
for k, v in all_variables.items()
if k in coords or isinstance(v, pd.MultiIndex)
}
dataarray_coords: list[DataArrayCoordinates] = []

for name, obj in coords_promoted.items():
Expand Down
19 changes: 14 additions & 5 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -8761,13 +8761,20 @@ def polyfit(
skipna_da = bool(np.any(da.isnull()))

dims_to_stack = [dimname for dimname in da.dims if dimname != dim]
stacked_coords: dict[Hashable, DataArray] = {}
stacked_coords = Coordinates()
if dims_to_stack:
stacked_dim = utils.get_temp_dimname(dims_to_stack, "stacked")
rhs = da.transpose(dim, *dims_to_stack).stack(
{stacked_dim: dims_to_stack}
)
stacked_coords = {stacked_dim: rhs[stacked_dim]}
idx, idx_vars = rhs._to_temp_dataset()._get_stack_index(
stacked_dim, multi=True
)
if idx is not None:
coords = Coordinates._construct_direct(
idx_vars, indexes={k: idx for k in idx_vars}
)
stacked_coords.update(coords)
scale_da = scale[:, np.newaxis]
else:
rhs = da
Expand All @@ -8792,10 +8799,12 @@ def polyfit(
# Thus a ReprObject => polyfit was called on a DataArray
name = ""

coeffs_coords = stacked_coords.assign({degree_dim: np.arange(order)[::-1]})

coeffs = DataArray(
coeffs / scale_da,
dims=[degree_dim] + list(stacked_coords.keys()),
coords={degree_dim: np.arange(order)[::-1], **stacked_coords},
dims=[degree_dim] + list(stacked_coords.dims),
coords=coeffs_coords,
name=name + "polyfit_coefficients",
)
if dims_to_stack:
Expand All @@ -8805,7 +8814,7 @@ def polyfit(
if full or (cov is True):
residuals = DataArray(
residuals if dims_to_stack else residuals.squeeze(),
dims=list(stacked_coords.keys()),
dims=stacked_coords.dims,
coords=stacked_coords,
name=name + "polyfit_residuals",
)
Expand Down
18 changes: 16 additions & 2 deletions xarray/core/indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -1335,12 +1335,13 @@ def rename(self, name_dict, dims_dict):
def create_default_index_implicit(
dim_variable: Variable,
all_variables: Mapping | Iterable[Hashable] | None = None,
warn_multi_index: bool = True,
) -> tuple[PandasIndex, IndexVars]:
"""Create a default index from a dimension variable.

Create a PandasMultiIndex if the given variable wraps a pandas.MultiIndex,
otherwise create a PandasIndex (note that this will become obsolete once we
depreciate implicitly passing a pandas.MultiIndex as a coordinate).
deprecate implicitly passing a pandas.MultiIndex as a coordinate).

"""
if all_variables is None:
Expand All @@ -1353,6 +1354,17 @@ def create_default_index_implicit(
index: PandasIndex

if isinstance(array, pd.MultiIndex):
if warn_multi_index:
emit_user_level_warning(
f"the `pandas.MultiIndex` object wrapped in variable {name!r} will no longer "
"be implicitly promoted into multiple indexed coordinates in the future. "
"If you want to keep this behavior, you need to first wrap it explicitly "
"using `xarray.Coordinates.from_pandas_multiindex()`. "
"If the multi-index was passed via a `pandas.DataFrame` or `pandas.Series` "
"object, please use `xarray.Dataset.from_dataframe` or "
"`xarray.DataArray.from_series` instead.",
FutureWarning,
)
index = PandasMultiIndex(array, name)
index_vars = index.create_variables()
# check for conflict between level names and variable names
Expand Down Expand Up @@ -1680,7 +1692,9 @@ def default_indexes(

for name, var in coords.items():
if name in dims and var.ndim == 1:
index, index_vars = create_default_index_implicit(var, coords)
index, index_vars = create_default_index_implicit(
var, coords, warn_multi_index=False
)
if set(index_vars) <= coord_names:
indexes.update({k: index for k in index_vars})

Expand Down
12 changes: 7 additions & 5 deletions xarray/tests/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import pandas as pd
import pytest

from xarray import DataArray, Dataset, Variable, concat
from xarray import Coordinates, DataArray, Dataset, Variable, concat
from xarray.core import dtypes, merge
from xarray.core.indexes import PandasIndex
from xarray.tests import (
Expand Down Expand Up @@ -906,8 +906,9 @@ def test_concat_dim_is_dataarray(self) -> None:
assert_identical(actual, expected)

def test_concat_multiindex(self) -> None:
x = pd.MultiIndex.from_product([[1, 2, 3], ["a", "b"]])
expected = Dataset(coords={"x": x})
midx = pd.MultiIndex.from_product([[1, 2, 3], ["a", "b"]])
midx_coords = Coordinates.from_pandas_multiindex(midx, "x")
expected = Dataset(coords=midx_coords)
actual = concat(
[expected.isel(x=slice(2)), expected.isel(x=slice(2, None))], "x"
)
Expand All @@ -917,8 +918,9 @@ def test_concat_multiindex(self) -> None:
def test_concat_along_new_dim_multiindex(self) -> None:
# see https://github.com/pydata/xarray/issues/6881
level_names = ["x_level_0", "x_level_1"]
x = pd.MultiIndex.from_product([[1, 2, 3], ["a", "b"]], names=level_names)
ds = Dataset(coords={"x": x})
midx = pd.MultiIndex.from_product([[1, 2, 3], ["a", "b"]], names=level_names)
midx_coords = Coordinates.from_pandas_multiindex(midx, "x")
ds = Dataset(coords=midx_coords)
concatenated = concat([ds], "new")
actual = list(concatenated.xindexes.get_all_coords("x"))
expected = ["x"] + level_names
Expand Down
9 changes: 6 additions & 3 deletions xarray/tests/test_dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import pytest

import xarray as xr
from xarray import DataArray, Dataset, Variable
from xarray import Coordinates, DataArray, Dataset, Variable
from xarray.core import duck_array_ops
from xarray.core.duck_array_ops import lazy_array_equiv
from xarray.testing import assert_chunks_equal
Expand Down Expand Up @@ -639,8 +639,11 @@ def test_stack(self):
data = da.random.normal(size=(2, 3, 4), chunks=(1, 3, 4))
arr = DataArray(data, dims=("w", "x", "y"))
stacked = arr.stack(z=("x", "y"))
z = pd.MultiIndex.from_product([np.arange(3), np.arange(4)], names=["x", "y"])
expected = DataArray(data.reshape(2, -1), {"z": z}, dims=["w", "z"])
midx = pd.MultiIndex.from_product(
[np.arange(3), np.arange(4)], names=["x", "y"]
)
midx_coords = Coordinates.from_pandas_multiindex(midx, "z")
expected = DataArray(data.reshape(2, -1), coords=midx_coords, dims=["w", "z"])
assert stacked.data.chunks == expected.data.chunks
self.assertLazyAndEqual(expected, stacked)

Expand Down
Loading