forked from pydata/xarray

Merge branch 'main' into groupby-shuffle
* main:
  GroupBy(chunked-array) (pydata#9522)
  DOC: mention attribute peculiarities in docs/docstrings (pydata#9700)
  add pydap-server dependencies to environment.yml (pydata#9709)
dcherian committed Nov 4, 2024
2 parents 888e780 + a00bc91 commit 47e5c17
Showing 14 changed files with 458 additions and 69 deletions.
8 changes: 8 additions & 0 deletions ci/requirements/environment.yml
@@ -36,6 +36,14 @@ dependencies:
- pre-commit
- pyarrow # pandas raises a deprecation warning without this, breaking doctests
- pydap
# start pydap server dependencies, can be removed if pydap-server is available
- gunicorn
- PasteDeploy
- docopt-ng
- Webob
- Jinja2
- beautifulsoup4
# end pydap server dependencies
- pytest
- pytest-cov
- pytest-env
3 changes: 3 additions & 0 deletions doc/getting-started-guide/faq.rst
@@ -146,6 +146,9 @@ for conflicts between ``attrs`` when combining arrays and datasets, unless
explicitly requested with the option ``compat='identical'``. The guiding
principle is that metadata should not be allowed to get in the way.

In general, xarray relies on the capabilities of the backends for reading and writing
attributes, which has some implications for roundtripping. One example of such an
inconsistency is that size-1 lists roundtrip as a single element (with netCDF4 backends).
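The peculiarity described above can be sketched in plain Python. The helper names (`write_attr`, `read_attr`) are hypothetical and merely model the behaviour: netCDF-style backends flatten attribute values into arrays on write, and a size-1 array comes back as a bare scalar on read, so the list-ness of `[42]` is lost.

```python
def write_attr(value):
    # netCDF-style backends store attribute values as flat arrays;
    # modelled here with plain lists (illustrative, not backend code).
    return list(value) if isinstance(value, (list, tuple)) else [value]

def read_attr(stored):
    # On read, a size-1 array comes back as a bare scalar, so the
    # original list-ness of [42] does not survive the roundtrip.
    return stored[0] if len(stored) == 1 else stored

read_attr(write_attr([42]))    # a size-1 list comes back as a scalar
read_attr(write_attr([1, 2]))  # longer lists roundtrip intact
```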

What other netCDF related Python libraries should I know about?
---------------------------------------------------------------

3 changes: 2 additions & 1 deletion doc/user-guide/data-structures.rst
@@ -40,7 +40,8 @@ alignment, building on the functionality of the ``index`` found on a pandas
DataArray objects also can have a ``name`` and can hold arbitrary metadata in
the form of their ``attrs`` property. Names and attributes are strictly for
users and user-written code: xarray makes no attempt to interpret them, and
propagates them only in unambiguous cases
propagates them only in unambiguous cases. For reading and writing attributes,
xarray relies on the capabilities of the supported backends.
(see FAQ, :ref:`approach to metadata`).

.. _creating a dataarray:
9 changes: 9 additions & 0 deletions doc/user-guide/groupby.rst
@@ -294,6 +294,15 @@ is identical to
ds.resample(time=TimeResampler("ME"))
The :py:class:`groupers.UniqueGrouper` accepts an optional ``labels`` kwarg that is not present
in :py:meth:`DataArray.groupby` or :py:meth:`Dataset.groupby`.
Specifying ``labels`` is required when grouping by a chunked (lazy) array type (e.g. dask or cubed).
The ``labels`` are used to construct the output coordinate (for a reduction, say), and
aggregations are run only over the specified labels.
You may also use ``labels`` to specify the order in which groups are iterated over;
that order is preserved in the output.
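The label-driven behaviour described above can be sketched without xarray. The helper `group_reduce` is hypothetical, not xarray's implementation: it aggregates only over the requested labels and emits groups in the order the labels were given.

```python
def group_reduce(values, keys, labels, reduce=sum):
    # Hypothetical helper mirroring UniqueGrouper(labels=...): aggregate
    # `values` by `keys`, but only over the requested `labels`, and emit
    # the groups in the order the labels were given.
    out = {label: [] for label in labels}
    for key, value in zip(keys, values):
        if key in out:  # keys outside `labels` are ignored
            out[key].append(value)
    return {label: reduce(members) for label, members in out.items()}

group_reduce([1, 2, 3, 4], ["a", "b", "a", "c"], labels=["b", "a"])
# group "c" is dropped; output ordered as ["b", "a"]
```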


.. _groupby.multiple:

Grouping by multiple variables
20 changes: 11 additions & 9 deletions doc/whats-new.rst
@@ -23,14 +23,21 @@ New Features
~~~~~~~~~~~~
- Added :py:meth:`DataTree.persist` method (:issue:`9675`, :pull:`9682`).
By `Sam Levang <https://github.com/slevang>`_.
- Support lazy grouping by dask arrays, and allow specifying ordered groups with ``UniqueGrouper(labels=["a", "b", "c"])``
(:issue:`2852`, :issue:`757`).
By `Deepak Cherian <https://github.com/dcherian>`_.

Breaking changes
~~~~~~~~~~~~~~~~


Deprecations
~~~~~~~~~~~~

- Grouping by a chunked array (e.g. dask or cubed) currently eagerly loads that variable into
  memory. This behaviour is deprecated. If eager loading was intended, please load such arrays
  manually using ``.load()`` or ``.compute()``. Otherwise, pass ``eagerly_compute_group=False`` and
  provide the expected group labels using the ``labels`` kwarg to a grouper object such as
  :py:class:`groupers.UniqueGrouper` or :py:class:`groupers.BinGrouper`.
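The deprecated-versus-new behaviour described in this entry can be sketched as a small decision function. `resolve_group` is purely illustrative (not xarray's actual code path), but it captures the migration: the eager path warns and loads, the lazy path requires labels.

```python
import warnings

def resolve_group(group_is_chunked, eagerly_compute_group, labels):
    """Illustrative sketch of the behaviour described above; not xarray's code."""
    if not group_is_chunked:
        return "group by in-memory values"
    if eagerly_compute_group:
        # Deprecated path: silently loading the chunked group into memory.
        warnings.warn(
            "eagerly loading a chunked group is deprecated; pass "
            "eagerly_compute_group=False and supply expected labels",
            DeprecationWarning,
        )
        return "load group eagerly, then group"
    if labels is None:
        raise ValueError(
            "lazy grouping needs expected labels, e.g. UniqueGrouper(labels=[...])"
        )
    return "group lazily over supplied labels"
```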

Bug fixes
~~~~~~~~~
Expand All @@ -43,6 +50,9 @@ Bug fixes
Documentation
~~~~~~~~~~~~~

- Mention attribute peculiarities in docs/docstrings (:issue:`4798`, :pull:`9700`).
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.


Internal Changes
~~~~~~~~~~~~~~~~
@@ -91,14 +101,6 @@ New Features
(:issue:`9427`, :pull:`9428`).
By `Alfonso Ladino <https://github.com/aladinor>`_.

Breaking changes
~~~~~~~~~~~~~~~~


Deprecations
~~~~~~~~~~~~


Bug fixes
~~~~~~~~~

2 changes: 1 addition & 1 deletion xarray/core/common.py
@@ -1156,7 +1156,7 @@ def _resample(
f"Received {type(freq)} instead."
)

rgrouper = ResolvedGrouper(grouper, group, self)
rgrouper = ResolvedGrouper(grouper, group, self, eagerly_compute_group=False)

return resample_cls(
self,
21 changes: 19 additions & 2 deletions xarray/core/dataarray.py
@@ -347,6 +347,7 @@ class DataArray(
attrs : dict_like or None, optional
Attributes to assign to the new instance. By default, an empty
attribute dictionary is initialized.
(see FAQ, :ref:`approach to metadata`)
indexes : py:class:`~xarray.Indexes` or dict-like, optional
For internal use only. For passing indexes objects to the
new DataArray, use the ``coords`` argument instead with a
@@ -6747,6 +6748,7 @@ def groupby(
*,
squeeze: Literal[False] = False,
restore_coord_dims: bool = False,
eagerly_compute_group: bool = True,
**groupers: Grouper,
) -> DataArrayGroupBy:
"""Returns a DataArrayGroupBy object for performing grouped operations.
@@ -6762,6 +6764,11 @@
restore_coord_dims : bool, default: False
If True, also restore the dimension order of multi-dimensional
coordinates.
eagerly_compute_group : bool, default: True
Whether to eagerly compute ``group`` when it is a chunked array.
This option exists only for backwards compatibility. Set to False
to opt in to the future behaviour, in which ``group`` is not automatically
loaded into memory.
**groupers : Mapping of str to Grouper or Resampler
Mapping of variable name to group by to :py:class:`Grouper` or :py:class:`Resampler` object.
One of ``group`` or ``groupers`` must be provided.
@@ -6876,7 +6883,9 @@ )
)

_validate_groupby_squeeze(squeeze)
rgroupers = _parse_group_and_groupers(self, group, groupers)
rgroupers = _parse_group_and_groupers(
self, group, groupers, eagerly_compute_group=eagerly_compute_group
)
return DataArrayGroupBy(self, rgroupers, restore_coord_dims=restore_coord_dims)

@_deprecate_positional_args("v2024.07.0")
@@ -6891,6 +6900,7 @@ def groupby_bins(
squeeze: Literal[False] = False,
restore_coord_dims: bool = False,
duplicates: Literal["raise", "drop"] = "raise",
eagerly_compute_group: bool = True,
) -> DataArrayGroupBy:
"""Returns a DataArrayGroupBy object for performing grouped operations.
@@ -6927,6 +6937,11 @@
coordinates.
duplicates : {"raise", "drop"}, default: "raise"
If bin edges are not unique, raise ValueError or drop non-uniques.
eagerly_compute_group : bool, default: True
Whether to eagerly compute ``group`` when it is a chunked array.
This option exists only for backwards compatibility. Set to False
to opt in to the future behaviour, in which ``group`` is not automatically
loaded into memory.
Returns
-------
@@ -6964,7 +6979,9 @@
precision=precision,
include_lowest=include_lowest,
)
rgrouper = ResolvedGrouper(grouper, group, self)
rgrouper = ResolvedGrouper(
grouper, group, self, eagerly_compute_group=eagerly_compute_group
)

return DataArrayGroupBy(
self,
21 changes: 19 additions & 2 deletions xarray/core/dataset.py
@@ -597,6 +597,7 @@ class Dataset(
attrs : dict-like, optional
Global attributes to save on this dataset.
(see FAQ, :ref:`approach to metadata`)
Examples
--------
@@ -10403,6 +10404,7 @@ def groupby(
*,
squeeze: Literal[False] = False,
restore_coord_dims: bool = False,
eagerly_compute_group: bool = True,
**groupers: Grouper,
) -> DatasetGroupBy:
"""Returns a DatasetGroupBy object for performing grouped operations.
@@ -10418,6 +10420,11 @@
restore_coord_dims : bool, default: False
If True, also restore the dimension order of multi-dimensional
coordinates.
eagerly_compute_group : bool, default: True
Whether to eagerly compute ``group`` when it is a chunked array.
This option exists only for backwards compatibility. Set to False
to opt in to the future behaviour, in which ``group`` is not automatically
loaded into memory.
**groupers : Mapping of str to Grouper or Resampler
Mapping of variable name to group by to :py:class:`Grouper` or :py:class:`Resampler` object.
One of ``group`` or ``groupers`` must be provided.
@@ -10500,7 +10507,9 @@
)

_validate_groupby_squeeze(squeeze)
rgroupers = _parse_group_and_groupers(self, group, groupers)
rgroupers = _parse_group_and_groupers(
self, group, groupers, eagerly_compute_group=eagerly_compute_group
)

return DatasetGroupBy(self, rgroupers, restore_coord_dims=restore_coord_dims)

@@ -10516,6 +10525,7 @@
squeeze: Literal[False] = False,
restore_coord_dims: bool = False,
duplicates: Literal["raise", "drop"] = "raise",
eagerly_compute_group: bool = True,
) -> DatasetGroupBy:
"""Returns a DatasetGroupBy object for performing grouped operations.
@@ -10552,6 +10562,11 @@
coordinates.
duplicates : {"raise", "drop"}, default: "raise"
If bin edges are not unique, raise ValueError or drop non-uniques.
eagerly_compute_group : bool, default: True
Whether to eagerly compute ``group`` when it is a chunked array.
This option exists only for backwards compatibility. Set to False
to opt in to the future behaviour, in which ``group`` is not automatically
loaded into memory.
Returns
-------
@@ -10589,7 +10604,9 @@
precision=precision,
include_lowest=include_lowest,
)
rgrouper = ResolvedGrouper(grouper, group, self)
rgrouper = ResolvedGrouper(
grouper, group, self, eagerly_compute_group=eagerly_compute_group
)

return DatasetGroupBy(
self,
