Merge branch 'main' into help-fig

JessicaS11 authored Jun 24, 2024
2 parents 0d3f667 + 872c1c5 commit 4cfebee
Showing 9 changed files with 138 additions and 78 deletions.
4 changes: 4 additions & 0 deletions doc/whats-new.rst
@@ -39,8 +39,12 @@ Bug fixes

Documentation
~~~~~~~~~~~~~

- Adds a flow-chart diagram to help users navigate help resources (`Discussion #8990 <https://github.com/pydata/xarray/discussions/8990>`_).
By `Jessica Scheick <https://github.com/jessicas11>`_.
- Improvements to Zarr & chunking docs (:pull:`9139`, :pull:`9140`, :pull:`9132`)
By `Maximilian Roos <https://github.com/max-sixty>`_.


Internal Changes
~~~~~~~~~~~~~~~~
7 changes: 1 addition & 6 deletions pyproject.toml
@@ -110,15 +110,12 @@ module = [
"cloudpickle.*",
"cubed.*",
"cupy.*",
"dask.types.*",
"fsspec.*",
"h5netcdf.*",
"h5py.*",
"iris.*",
"matplotlib.*",
"mpl_toolkits.*",
"nc_time_axis.*",
"numbagg.*",
"netCDF4.*",
"netcdftime.*",
"opt_einsum.*",
@@ -127,7 +124,6 @@ module = [
"pooch.*",
"pyarrow.*",
"pydap.*",
"pytest.*",
"scipy.*",
"seaborn.*",
"setuptools",
@@ -329,8 +325,7 @@ filterwarnings = [
"default:the `pandas.MultiIndex` object:FutureWarning:xarray.tests.test_variable",
"default:Using a non-tuple sequence for multidimensional indexing is deprecated:FutureWarning",
"default:Duplicate dimension names present:UserWarning:xarray.namedarray.core",
"default:::xarray.tests.test_strategies",
# TODO: remove once we know how to deal with a changed signature in protocols
"default:::xarray.tests.test_strategies", # TODO: remove once we know how to deal with a changed signature in protocols
"ignore:__array__ implementation doesn't accept a copy keyword, so passing copy=False failed.",
]

43 changes: 26 additions & 17 deletions xarray/backends/api.py
@@ -425,15 +425,19 @@ def open_dataset(
is chosen based on available dependencies, with a preference for
"netcdf4". A custom backend class (a subclass of ``BackendEntrypoint``)
can also be used.
chunks : int, dict, 'auto' or None, optional
If chunks is provided, it is used to load the new dataset into dask
arrays. ``chunks=-1`` loads the dataset with dask using a single
chunk for all arrays. ``chunks={}`` loads the dataset with dask using
engine preferred chunks if exposed by the backend, otherwise with
a single chunk for all arrays. In order to reproduce the default behavior
of ``xr.open_zarr(...)`` use ``xr.open_dataset(..., engine='zarr', chunks={})``.
``chunks='auto'`` will use dask ``auto`` chunking taking into account the
engine preferred chunks. See dask chunking for more details.
chunks : int, dict, 'auto' or None, default: None
If provided, used to load the data into dask arrays.
- ``chunks="auto"`` will use dask ``auto`` chunking taking into account the
engine preferred chunks.
- ``chunks=None`` skips using dask, which is generally faster for
small arrays.
- ``chunks=-1`` loads the data with dask using a single chunk for all arrays.
- ``chunks={}`` loads the data with dask using the engine's preferred chunk
size, generally identical to the format's chunk size. If not available, a
single chunk for all arrays.
See dask chunking for more details.
cache : bool, optional
If True, cache data loaded from the underlying datastore in memory as
NumPy arrays when accessed to avoid reading from the underlying data-
@@ -631,14 +635,19 @@ def open_dataarray(
Engine to use when reading files. If not provided, the default engine
is chosen based on available dependencies, with a preference for
"netcdf4".
chunks : int, dict, 'auto' or None, optional
If chunks is provided, it is used to load the new dataset into dask
arrays. ``chunks=-1`` loads the dataset with dask using a single
chunk for all arrays. `chunks={}`` loads the dataset with dask using
engine preferred chunks if exposed by the backend, otherwise with
a single chunk for all arrays.
``chunks='auto'`` will use dask ``auto`` chunking taking into account the
engine preferred chunks. See dask chunking for more details.
chunks : int, dict, 'auto' or None, default: None
If provided, used to load the data into dask arrays.
- ``chunks='auto'`` will use dask ``auto`` chunking taking into account the
engine preferred chunks.
- ``chunks=None`` skips using dask, which is generally faster for
small arrays.
- ``chunks=-1`` loads the data with dask using a single chunk for all arrays.
- ``chunks={}`` loads the data with dask using engine preferred chunks if
exposed by the backend, otherwise with a single chunk for all arrays.
See dask chunking for more details.
cache : bool, optional
If True, cache data loaded from the underlying datastore in memory as
NumPy arrays when accessed to avoid reading from the underlying data-
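The hunks above enumerate the ``chunks`` options for ``open_dataset`` and ``open_dataarray``. A minimal usage sketch of those options, assuming dask and a netCDF backend are installed; the file name is a throwaway placeholder:

import numpy as np
import xarray as xr

# Write a small file purely for illustration.
xr.Dataset({"temp": ("x", np.arange(1000.0))}).to_netcdf("example.nc")

ds_eager = xr.open_dataset("example.nc", chunks=None)    # no dask; plain NumPy arrays
ds_auto = xr.open_dataset("example.nc", chunks="auto")   # dask "auto" chunking, taking the
                                                         # engine preferred chunks into account
ds_single = xr.open_dataset("example.nc", chunks=-1)     # a single dask chunk for all arrays
ds_native = xr.open_dataset("example.nc", chunks={})     # engine preferred chunks (falls back
                                                         # to a single chunk per array)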
18 changes: 12 additions & 6 deletions xarray/backends/zarr.py
@@ -973,12 +973,18 @@ def open_zarr(
Array synchronizer provided to zarr
group : str, optional
Group path. (a.k.a. `path` in zarr terminology.)
chunks : int or dict or tuple or {None, 'auto'}, optional
Chunk sizes along each dimension, e.g., ``5`` or
``{'x': 5, 'y': 5}``. If `chunks='auto'`, dask chunks are created
based on the variable's zarr chunks. If `chunks=None`, zarr array
data will lazily convert to numpy arrays upon access. This accepts
all the chunk specifications as Dask does.
chunks : int, dict, 'auto' or None, default: 'auto'
If provided, used to load the data into dask arrays.
- ``chunks='auto'`` will use dask ``auto`` chunking taking into account the
engine preferred chunks.
- ``chunks=None`` skips using dask, which is generally faster for
small arrays.
- ``chunks=-1`` loads the data with dask using a single chunk for all arrays.
- ``chunks={}`` loads the data with dask using engine preferred chunks if
exposed by the backend, otherwise with a single chunk for all arrays.
See dask chunking for more details.
overwrite_encoded_chunks : bool, optional
Whether to drop the zarr chunks encoded for each variable when a
dataset is loaded with specified chunk sizes (default: False)
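The same option set applies to ``open_zarr``, where ``chunks='auto'`` is the default. A companion sketch, assuming zarr and dask are installed and using a placeholder store path:

import numpy as np
import xarray as xr

# Write a small chunked store purely for illustration.
xr.Dataset({"temp": ("x", np.arange(1000.0))}).chunk({"x": 100}).to_zarr("store.zarr")

ds_default = xr.open_zarr("store.zarr")              # same as chunks="auto": dask chunks
                                                     # follow the variable's zarr chunks
ds_nodask = xr.open_zarr("store.zarr", chunks=None)  # no dask; lazy conversion to NumPy on access
ds_generic = xr.open_dataset("store.zarr", engine="zarr", chunks={})  # reproduces open_zarr's
                                                                      # default behaviour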
28 changes: 15 additions & 13 deletions xarray/core/dataset.py
@@ -2458,24 +2458,26 @@ def to_zarr(
If set, the dimension along which the data will be appended. All
other dimensions on overridden variables must remain the same size.
region : dict or "auto", optional
Optional mapping from dimension names to integer slices along
dataset dimensions to indicate the region of existing zarr array(s)
in which to write this dataset's data. For example,
``{'x': slice(0, 1000), 'y': slice(10000, 11000)}`` would indicate
that values should be written to the region ``0:1000`` along ``x``
and ``10000:11000`` along ``y``.
Can also specify ``"auto"``, in which case the existing store will be
opened and the region inferred by matching the new data's coordinates.
``"auto"`` can be used as a single string, which will automatically infer
the region for all dimensions, or as dictionary values for specific
dimensions mixed together with explicit slices for other dimensions.
Optional mapping from dimension names to either a) ``"auto"``, or b) integer
slices, indicating the region of existing zarr array(s) in which to write
this dataset's data.
If ``"auto"`` is provided the existing store will be opened and the region
inferred by matching indexes. ``"auto"`` can be used as a single string,
which will automatically infer the region for all dimensions, or as
dictionary values for specific dimensions mixed together with explicit
slices for other dimensions.
Alternatively integer slices can be provided; for example, ``{'x': slice(0,
1000), 'y': slice(10000, 11000)}`` would indicate that values should be
written to the region ``0:1000`` along ``x`` and ``10000:11000`` along
``y``.
Two restrictions apply to the use of ``region``:
- If ``region`` is set, _all_ variables in a dataset must have at
least one dimension in common with the region. Other variables
should be written in a separate call to ``to_zarr()``.
should be written in a separate single call to ``to_zarr()``.
- Dimensions cannot be included in both ``region`` and
``append_dim`` at the same time. To create empty arrays to fill
in with ``region``, use a separate call to ``to_zarr()`` with
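A short sketch of the ``region`` forms described above (``"auto"`` for every dimension, explicit integer slices, or a per-dimension mix), assuming zarr is installed and using a placeholder store path:

import numpy as np
import xarray as xr

ds = xr.Dataset({"temp": ("x", np.zeros(1000))}, coords={"x": np.arange(1000)})

# Create the target arrays up front; compute=False delays writing the (dask-backed) values.
ds.chunk({"x": 100}).to_zarr("store.zarr", compute=False)

update = ds.isel(x=slice(0, 100)) + 1.0

# Infer the region for every dimension by matching indexes against the store.
update.to_zarr("store.zarr", region="auto")

# Per-dimension form: "auto" values can be mixed with explicit slices.
update.to_zarr("store.zarr", region={"x": "auto"})

# Fully explicit form: integer slices per dimension; the indexed coordinate is
# dropped here so that only the data variable is rewritten.
update.drop_vars("x").to_zarr("store.zarr", region={"x": slice(0, 100)})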
63 changes: 41 additions & 22 deletions xarray/core/groupby.py
@@ -19,8 +19,10 @@
from xarray.core.arithmetic import DataArrayGroupbyArithmetic, DatasetGroupbyArithmetic
from xarray.core.common import ImplementsArrayReduce, ImplementsDatasetReduce
from xarray.core.concat import concat
from xarray.core.coordinates import Coordinates
from xarray.core.formatting import format_array_flat
from xarray.core.indexes import (
PandasIndex,
create_default_index_implicit,
filter_indexes_from_coords,
)
@@ -246,7 +248,7 @@ def to_array(self) -> DataArray:
return self.to_dataarray()


T_Group = Union["T_DataArray", "IndexVariable", _DummyGroup]
T_Group = Union["T_DataArray", _DummyGroup]


def _ensure_1d(group: T_Group, obj: T_DataWithCoords) -> tuple[
@@ -256,7 +258,7 @@ def _ensure_1d(group: T_Group, obj: T_DataWithCoords) -> tuple[
list[Hashable],
]:
# 1D cases: do nothing
if isinstance(group, (IndexVariable, _DummyGroup)) or group.ndim == 1:
if isinstance(group, _DummyGroup) or group.ndim == 1:
return group, obj, None, []

from xarray.core.dataarray import DataArray
@@ -271,9 +273,7 @@ def _ensure_1d(group: T_Group, obj: T_DataWithCoords) -> tuple[
newobj = obj.stack({stacked_dim: orig_dims})
return newgroup, newobj, stacked_dim, inserted_dims

raise TypeError(
f"group must be DataArray, IndexVariable or _DummyGroup, got {type(group)!r}."
)
raise TypeError(f"group must be DataArray or _DummyGroup, got {type(group)!r}.")


@dataclass
@@ -299,7 +299,7 @@ class ResolvedGrouper(Generic[T_DataWithCoords]):
codes: DataArray = field(init=False)
full_index: pd.Index = field(init=False)
group_indices: T_GroupIndices = field(init=False)
unique_coord: IndexVariable | _DummyGroup = field(init=False)
unique_coord: Variable | _DummyGroup = field(init=False)

# _ensure_1d:
group1d: T_Group = field(init=False)
@@ -315,7 +315,7 @@ def __post_init__(self) -> None:
# might be used multiple times.
self.grouper = copy.deepcopy(self.grouper)

self.group: T_Group = _resolve_group(self.obj, self.group)
self.group = _resolve_group(self.obj, self.group)

(
self.group1d,
@@ -328,14 +328,18 @@

@property
def name(self) -> Hashable:
"""Name for the grouped coordinate after reduction."""
# the name has to come from unique_coord because we need `_bins` suffix for BinGrouper
return self.unique_coord.name
(name,) = self.unique_coord.dims
return name

@property
def size(self) -> int:
"""Number of groups."""
return len(self)

def __len__(self) -> int:
"""Number of groups."""
return len(self.full_index)

@property
@@ -358,8 +362,8 @@ def factorize(self) -> None:
]
if encoded.unique_coord is None:
unique_values = self.full_index[np.unique(encoded.codes)]
self.unique_coord = IndexVariable(
self.codes.name, unique_values, attrs=self.group.attrs
self.unique_coord = Variable(
dims=self.codes.name, data=unique_values, attrs=self.group.attrs
)
else:
self.unique_coord = encoded.unique_coord
@@ -378,7 +382,9 @@ def _validate_groupby_squeeze(squeeze: bool | None) -> None:
)


def _resolve_group(obj: T_DataWithCoords, group: T_Group | Hashable) -> T_Group:
def _resolve_group(
obj: T_DataWithCoords, group: T_Group | Hashable | IndexVariable
) -> T_Group:
from xarray.core.dataarray import DataArray

error_msg = (
@@ -620,6 +626,8 @@ def _iter_grouped(self, warn_squeeze=True) -> Iterator[T_Xarray]:
yield self._obj.isel({self._group_dim: indices})

def _infer_concat_args(self, applied_example):
from xarray.core.groupers import BinGrouper

(grouper,) = self.groupers
if self._group_dim in applied_example.dims:
coord = grouper.group1d
@@ -628,7 +636,10 @@ def _infer_concat_args(self, applied_example):
coord = grouper.unique_coord
positions = None
(dim,) = coord.dims
if isinstance(coord, _DummyGroup):
if isinstance(grouper.group, _DummyGroup) and not isinstance(
grouper.grouper, BinGrouper
):
# When binning we actually do set the index
coord = None
coord = getattr(coord, "variable", coord)
return coord, dim, positions
@@ -641,6 +652,7 @@ def _binary_op(self, other, f, reflexive=False):

(grouper,) = self.groupers
obj = self._original_obj
name = grouper.name
group = grouper.group
codes = self._codes
dims = group.dims
@@ -649,9 +661,11 @@ def _binary_op(self, other, f, reflexive=False):
group = coord = group.to_dataarray()
else:
coord = grouper.unique_coord
if not isinstance(coord, DataArray):
coord = DataArray(grouper.unique_coord)
name = grouper.name
if isinstance(coord, Variable):
assert coord.ndim == 1
(coord_dim,) = coord.dims
# TODO: explicitly create Index here
coord = DataArray(coord, coords={coord_dim: coord.data})

if not isinstance(other, (Dataset, DataArray)):
raise TypeError(
@@ -766,6 +780,7 @@ def _flox_reduce(

obj = self._original_obj
(grouper,) = self.groupers
name = grouper.name
isbin = isinstance(grouper.grouper, BinGrouper)

if keep_attrs is None:
@@ -797,14 +812,14 @@ def _flox_reduce(
# weird backcompat
# reducing along a unique indexed dimension with squeeze=True
# should raise an error
if (dim is None or dim == grouper.name) and grouper.name in obj.xindexes:
index = obj.indexes[grouper.name]
if (dim is None or dim == name) and name in obj.xindexes:
index = obj.indexes[name]
if index.is_unique and self._squeeze:
raise ValueError(f"cannot reduce over dimensions {grouper.name!r}")
raise ValueError(f"cannot reduce over dimensions {name!r}")

unindexed_dims: tuple[Hashable, ...] = tuple()
if isinstance(grouper.group, _DummyGroup) and not isbin:
unindexed_dims = (grouper.name,)
unindexed_dims = (name,)

parsed_dim: tuple[Hashable, ...]
if isinstance(dim, str):
@@ -848,15 +863,19 @@ def _flox_reduce(
# in the grouped variable
group_dims = grouper.group.dims
if set(group_dims).issubset(set(parsed_dim)):
result[grouper.name] = output_index
result = result.assign_coords(
Coordinates(
coords={name: (name, np.array(output_index))},
indexes={name: PandasIndex(output_index, dim=name)},
)
)
result = result.drop_vars(unindexed_dims)

# broadcast and restore non-numeric data variables (backcompat)
for name, var in non_numeric.items():
if all(d not in var.dims for d in parsed_dim):
result[name] = var.variable.set_dims(
(grouper.name,) + var.dims,
(result.sizes[grouper.name],) + var.shape,
(name,) + var.dims, (result.sizes[name],) + var.shape
)

if not isinstance(result, Dataset):
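The most visible API pattern in the groupby changes above is in the ``_flox_reduce`` hunk, which replaces ``result[grouper.name] = output_index`` with assigning a ``Coordinates`` object built around an explicit ``PandasIndex``. A standalone illustration of that pattern (the names and values here are made up and not part of the groupby internals):

import numpy as np
import pandas as pd
import xarray as xr
from xarray.core.coordinates import Coordinates
from xarray.core.indexes import PandasIndex

# Attach a coordinate together with an explicitly constructed index, rather than
# relying on item assignment to create a default index implicitly.
labels = pd.Index(["a", "b", "c"], name="letters")
da = xr.DataArray(np.arange(3), dims="letters")

coords = Coordinates(
    coords={"letters": ("letters", np.array(labels))},
    indexes={"letters": PandasIndex(labels, dim="letters")},
)
da = da.assign_coords(coords)
print(da.indexes["letters"])  # pandas Index with values ['a', 'b', 'c']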