
Commit 79cfe4d

Merge remote-tracking branch 'origin/main' into backend-indexing

* origin/main:
  clean up the upstream-dev setup script (#8986)
  Skip flaky `test_open_mfdataset_manyfiles` test (#8989)
  Remove `.drop` warning allow (#8988)
  Add notes on when to add ignores to warnings (#8987)
  Docstring and documentation improvement for the Dataset class (#8973)

2 parents: e96e70e + bfcb0a7

File tree: 5 files changed (+92, -79 lines)

ci/install-upstream-wheels.sh

+4-22
@@ -1,7 +1,5 @@
 #!/usr/bin/env bash
 
-# install cython for building cftime without build isolation
-micromamba install "cython>=0.29.20" py-cpuinfo setuptools-scm
 # temporarily (?) remove numbagg and numba
 micromamba remove -y numba numbagg sparse
 # temporarily remove numexpr
@@ -18,10 +16,9 @@ micromamba remove -y --force \
     zarr \
     cftime \
     packaging \
-    pint \
     bottleneck \
-    flox \
-    numcodecs
+    flox
+    # pint
 # to limit the runtime of Upstream CI
 python -m pip install \
     -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple \
@@ -42,32 +39,17 @@ python -m pip install \
     --pre \
     --upgrade \
     pyarrow
-# without build isolation for packages compiling against numpy
-# TODO: remove once there are `numpy>=2.0` builds for these
-python -m pip install \
-    --no-deps \
-    --upgrade \
-    --no-build-isolation \
-    git+https://github.com/Unidata/cftime
-python -m pip install \
-    --no-deps \
-    --upgrade \
-    --no-build-isolation \
-    git+https://github.com/zarr-developers/numcodecs
-python -m pip install \
-    --no-deps \
-    --upgrade \
-    --no-build-isolation \
-    git+https://github.com/pydata/bottleneck
 python -m pip install \
     --no-deps \
     --upgrade \
     git+https://github.com/dask/dask \
     git+https://github.com/dask/dask-expr \
     git+https://github.com/dask/distributed \
     git+https://github.com/zarr-developers/zarr \
+    git+https://github.com/Unidata/cftime \
     git+https://github.com/pypa/packaging \
     git+https://github.com/hgrecco/pint \
+    git+https://github.com/pydata/bottleneck \
     git+https://github.com/intake/filesystem_spec \
     git+https://github.com/SciTools/nc-time-axis \
     git+https://github.com/xarray-contrib/flox \

doc/user-guide/data-structures.rst

+33-20
@@ -282,27 +282,40 @@ variables (``data_vars``), coordinates (``coords``) and attributes (``attrs``).
 
 - ``attrs`` should be a dictionary.
 
-Let's create some fake data for the example we show above:
+Let's create some fake data for the example we show above. In this
+example dataset, we will represent measurements of the temperature and
+precipitation that were made under various conditions:
+
+* the measurements were made on four different days;
+* they were made at two separate locations, which we will represent using
+  their latitude and longitude; and
+* they were made using instruments by three different manufacturers, which we
+  will refer to as `'manufac1'`, `'manufac2'`, and `'manufac3'`.
 
 .. ipython:: python
 
-    temp = 15 + 8 * np.random.randn(2, 2, 3)
-    precip = 10 * np.random.rand(2, 2, 3)
-    lon = [[-99.83, -99.32], [-99.79, -99.23]]
-    lat = [[42.25, 42.21], [42.63, 42.59]]
+    np.random.seed(0)
+    temperature = 15 + 8 * np.random.randn(2, 3, 4)
+    precipitation = 10 * np.random.rand(2, 3, 4)
+    lon = [-99.83, -99.32]
+    lat = [42.25, 42.21]
+    instruments = ["manufac1", "manufac2", "manufac3"]
+    time = pd.date_range("2014-09-06", periods=4)
+    reference_time = pd.Timestamp("2014-09-05")
 
     # for real use cases, its good practice to supply array attributes such as
     # units, but we won't bother here for the sake of brevity
     ds = xr.Dataset(
         {
-            "temperature": (["x", "y", "time"], temp),
-            "precipitation": (["x", "y", "time"], precip),
+            "temperature": (["loc", "instrument", "time"], temperature),
+            "precipitation": (["loc", "instrument", "time"], precipitation),
         },
         coords={
-            "lon": (["x", "y"], lon),
-            "lat": (["x", "y"], lat),
-            "time": pd.date_range("2014-09-06", periods=3),
-            "reference_time": pd.Timestamp("2014-09-05"),
+            "lon": (["loc"], lon),
+            "lat": (["loc"], lat),
+            "instrument": instruments,
+            "time": time,
+            "reference_time": reference_time,
         },
     )
     ds
@@ -387,12 +400,12 @@ example, to create this example dataset from scratch, we could have written:
 .. ipython:: python
 
     ds = xr.Dataset()
-    ds["temperature"] = (("x", "y", "time"), temp)
-    ds["temperature_double"] = (("x", "y", "time"), temp * 2)
-    ds["precipitation"] = (("x", "y", "time"), precip)
-    ds.coords["lat"] = (("x", "y"), lat)
-    ds.coords["lon"] = (("x", "y"), lon)
-    ds.coords["time"] = pd.date_range("2014-09-06", periods=3)
+    ds["temperature"] = (("loc", "instrument", "time"), temperature)
+    ds["temperature_double"] = (("loc", "instrument", "time"), temperature * 2)
+    ds["precipitation"] = (("loc", "instrument", "time"), precipitation)
+    ds.coords["lat"] = (("loc",), lat)
+    ds.coords["lon"] = (("loc",), lon)
+    ds.coords["time"] = pd.date_range("2014-09-06", periods=4)
     ds.coords["reference_time"] = pd.Timestamp("2014-09-05")
 
 To change the variables in a ``Dataset``, you can use all the standard dictionary
@@ -452,8 +465,8 @@ follow nested function calls:
 
     # these lines are equivalent, but with pipe we can make the logic flow
     # entirely from left to right
-    plt.plot((2 * ds.temperature.sel(x=0)).mean("y"))
-    (ds.temperature.sel(x=0).pipe(lambda x: 2 * x).mean("y").pipe(plt.plot))
+    plt.plot((2 * ds.temperature.sel(loc=0)).mean("instrument"))
+    (ds.temperature.sel(loc=0).pipe(lambda x: 2 * x).mean("instrument").pipe(plt.plot))
 
 Both ``pipe`` and ``assign`` replicate the pandas methods of the same names
 (:py:meth:`DataFrame.pipe <pandas.DataFrame.pipe>` and
@@ -479,7 +492,7 @@ dimension and non-dimension variables:
 
 .. ipython:: python
 
-    ds.coords["day"] = ("time", [6, 7, 8])
+    ds.coords["day"] = ("time", [6, 7, 8, 9])
     ds.swap_dims({"time": "day"})
 
 .. _coordinates:
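The updated example data can be sanity-checked outside the docs build. Here is a minimal sketch using only numpy and pandas (no xarray), verifying that each data variable matches the sizes implied by the ``loc``, ``instrument``, and ``time`` dimensions described above:

```python
import numpy as np
import pandas as pd

# reproduce the example arrays from the updated docs: 2 locations,
# 3 instrument manufacturers, 4 days of measurements
np.random.seed(0)
temperature = 15 + 8 * np.random.randn(2, 3, 4)
precipitation = 10 * np.random.rand(2, 3, 4)
lon = [-99.83, -99.32]
lat = [42.25, 42.21]
instruments = ["manufac1", "manufac2", "manufac3"]
time = pd.date_range("2014-09-06", periods=4)

# each data variable must match the (loc, instrument, time) dimension sizes
assert temperature.shape == (len(lon), len(instruments), len(time))
assert precipitation.shape == temperature.shape
```

Any array whose shape disagrees with these coordinate lengths would be rejected by the ``xr.Dataset`` constructor, which is why the docs change the coordinates and the random arrays together.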

pyproject.toml

+14-11
@@ -288,18 +288,22 @@ addopts = ["--strict-config", "--strict-markers"]
 # - Converts any warning from xarray into an error
 # - Allows some warnings ("default") which the test suite currently raises,
 #   since it wasn't practical to fix them all before merging this config. The
-#   arnings are still listed in CI (since it uses `default`, not `ignore`).
+#   warnings are reported in CI (since it uses `default`, not `ignore`).
 #
-# We can remove these rules allowing warnings; a valued contribution is removing
-# a line, seeing what breaks, and then fixing the library code or tests so that
-# it doesn't raise warnings.
+# Over time, we can remove these rules allowing warnings. A valued contribution
+# is removing a line, seeing what breaks, and then fixing the library code or
+# tests so that it doesn't raise warnings.
 #
-# While we only raise an error on warnings from within xarray, if dependency
-# raises a warning with a stacklevel such that it's interpreted to be raised
-# from xarray, please feel free to add a rule switching it to `default` here.
-#
-# If these settings get in the way of making progress, it's also acceptable to
-# temporarily add additional ignores.
+# There are some instances where we'll want to add to these rules:
+# - While we only raise errors on warnings from within xarray, a dependency can
+#   raise a warning with a stacklevel such that it's interpreted to be raised
+#   from xarray and this will mistakenly convert it to an error. If that
+#   happens, please feel free to add a rule switching it to `default` here, and
+#   disabling the error.
+# - If these settings get in the way of making progress, it's also acceptable to
+#   temporarily add additional `default` rules.
+# - But we should only add `ignore` rules if we're confident that we'll never
+#   need to address a warning.
 
 filterwarnings = [
     "error:::xarray.*",
@@ -315,7 +319,6 @@ filterwarnings = [
     "default:deallocating CachingFileManager:RuntimeWarning:xarray.backends.netCDF4_",
     "default:deallocating CachingFileManager:RuntimeWarning:xarray.core.indexing",
     "default:Failed to decode variable.*NumPy will stop allowing conversion of out-of-bound Python integers to integer arrays:DeprecationWarning",
-    "default:dropping variables using `drop` is deprecated; use drop_vars:DeprecationWarning:xarray.tests.test_groupby",
     "default:The `interpolation` argument to quantile was renamed to `method`:FutureWarning:xarray.*",
     "default:invalid value encountered in cast:RuntimeWarning:xarray.core.duck_array_ops",
     "default:invalid value encountered in cast:RuntimeWarning:xarray.conventions",
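The comment about dependency warnings hinges on Python's ``stacklevel`` mechanics: a library that warns with ``stacklevel=2`` attributes the warning to its caller, so a warning raised inside a dependency can be reported as coming from an xarray module and then tripped by the ``"error:::xarray.*"`` rule. A minimal stdlib sketch (the function names are made up for illustration):

```python
import warnings

def dependency_api():
    # stacklevel=2 points the warning at the *caller's* line, which is how a
    # dependency's deprecation warning can appear to originate from xarray code
    warnings.warn("old behaviour is deprecated", DeprecationWarning, stacklevel=2)

def xarray_like_caller():
    dependency_api()

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    xarray_like_caller()

# the recorded warning is attributed to this module (the caller's frame),
# not to the file where dependency_api is defined
print(caught[0].category.__name__, caught[0].filename)
```

Because pytest's ``filterwarnings`` matches on the warning's attributed module, such a warning needs its own ``default`` rule rather than a fix in xarray itself.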

xarray/core/dataset.py

+38-23
@@ -590,60 +590,75 @@ class Dataset(
 
     Examples
     --------
-    Create data:
+    In this example dataset, we will represent measurements of the temperature
+    and precipitation that were made under various conditions:
+
+    * the measurements were made on four different days;
+    * they were made at two separate locations, which we will represent using
+      their latitude and longitude; and
+    * they were made using three instruments developed by three different
+      manufacturers, which we will refer to using the strings `'manufac1'`,
+      `'manufac2'`, and `'manufac3'`.
 
     >>> np.random.seed(0)
-    >>> temperature = 15 + 8 * np.random.randn(2, 2, 3)
-    >>> precipitation = 10 * np.random.rand(2, 2, 3)
-    >>> lon = [[-99.83, -99.32], [-99.79, -99.23]]
-    >>> lat = [[42.25, 42.21], [42.63, 42.59]]
-    >>> time = pd.date_range("2014-09-06", periods=3)
+    >>> temperature = 15 + 8 * np.random.randn(2, 3, 4)
+    >>> precipitation = 10 * np.random.rand(2, 3, 4)
+    >>> lon = [-99.83, -99.32]
+    >>> lat = [42.25, 42.21]
+    >>> instruments = ["manufac1", "manufac2", "manufac3"]
+    >>> time = pd.date_range("2014-09-06", periods=4)
     >>> reference_time = pd.Timestamp("2014-09-05")
 
-    Initialize a dataset with multiple dimensions:
+    Here, we initialize the dataset with multiple dimensions. We use the string
+    `"loc"` to represent the location dimension of the data, the string
+    `"instrument"` to represent the instrument manufacturer dimension, and the
+    string `"time"` for the time dimension.
 
     >>> ds = xr.Dataset(
     ...     data_vars=dict(
-    ...         temperature=(["x", "y", "time"], temperature),
-    ...         precipitation=(["x", "y", "time"], precipitation),
+    ...         temperature=(["loc", "instrument", "time"], temperature),
+    ...         precipitation=(["loc", "instrument", "time"], precipitation),
    ...     ),
     ...     coords=dict(
-    ...         lon=(["x", "y"], lon),
-    ...         lat=(["x", "y"], lat),
+    ...         lon=("loc", lon),
+    ...         lat=("loc", lat),
+    ...         instrument=instruments,
     ...         time=time,
     ...         reference_time=reference_time,
     ...     ),
     ...     attrs=dict(description="Weather related data."),
     ... )
     >>> ds
-    <xarray.Dataset> Size: 288B
-    Dimensions:         (x: 2, y: 2, time: 3)
+    <xarray.Dataset> Size: 552B
+    Dimensions:         (loc: 2, instrument: 3, time: 4)
     Coordinates:
-        lon             (x, y) float64 32B -99.83 -99.32 -99.79 -99.23
-        lat             (x, y) float64 32B 42.25 42.21 42.63 42.59
-      * time            (time) datetime64[ns] 24B 2014-09-06 2014-09-07 2014-09-08
+        lon             (loc) float64 16B -99.83 -99.32
+        lat             (loc) float64 16B 42.25 42.21
+      * instrument      (instrument) <U8 96B 'manufac1' 'manufac2' 'manufac3'
+      * time            (time) datetime64[ns] 32B 2014-09-06 ... 2014-09-09
        reference_time  datetime64[ns] 8B 2014-09-05
-    Dimensions without coordinates: x, y
+    Dimensions without coordinates: loc
    Data variables:
-        temperature     (x, y, time) float64 96B 29.11 18.2 22.83 ... 16.15 26.63
-        precipitation   (x, y, time) float64 96B 5.68 9.256 0.7104 ... 4.615 7.805
+        temperature     (loc, instrument, time) float64 192B 29.11 18.2 ... 9.063
+        precipitation   (loc, instrument, time) float64 192B 4.562 5.684 ... 1.613
     Attributes:
        description:  Weather related data.
 
     Find out where the coldest temperature was and what values the
     other variables had:
 
     >>> ds.isel(ds.temperature.argmin(...))
-    <xarray.Dataset> Size: 48B
+    <xarray.Dataset> Size: 80B
    Dimensions:         ()
     Coordinates:
        lon             float64 8B -99.32
        lat             float64 8B 42.21
-        time            datetime64[ns] 8B 2014-09-08
+        instrument      <U8 32B 'manufac3'
+        time            datetime64[ns] 8B 2014-09-06
        reference_time  datetime64[ns] 8B 2014-09-05
    Data variables:
-        temperature     float64 8B 7.182
-        precipitation   float64 8B 8.326
+        temperature     float64 8B -5.424
+        precipitation   float64 8B 9.884
    Attributes:
        description:  Weather related data.
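The ``ds.isel(ds.temperature.argmin(...))`` result in the updated docstring can be cross-checked with plain numpy: reduce over all dimensions, then recover the per-dimension indices of the coldest measurement from the same seeded data.

```python
import numpy as np

np.random.seed(0)
temperature = 15 + 8 * np.random.randn(2, 3, 4)  # (loc, instrument, time)

# numpy analogue of ds.temperature.argmin(...) followed by ds.isel:
# flat argmin over the whole array, then unravel into per-dimension indices
loc_i, inst_i, time_i = np.unravel_index(temperature.argmin(), temperature.shape)
coldest = temperature[loc_i, inst_i, time_i]

# matches the docstring output: second location, instrument index 2
# ('manufac3'), first day, temperature -5.424
print((loc_i, inst_i, time_i), round(coldest, 3))
```

This is why the docstring's coldest point lands on ``instrument 'manufac3'`` and ``time 2014-09-06``: with ``np.random.seed(0)`` the minimum of the seeded array sits at index ``(1, 2, 0)``.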
xarray/tests/test_backends.py

+3-3
@@ -3813,11 +3813,11 @@ def skip_if_not_engine(engine):
     pytest.importorskip(engine)
 
 
-# Flaky test. Very open to contributions on fixing this
 @requires_dask
 @pytest.mark.filterwarnings("ignore:use make_scale(name) instead")
-@pytest.mark.xfail(reason="Flaky test. Very open to contributions on fixing this")
-@pytest.mark.skipif(ON_WINDOWS, reason="Skipping on Windows")
+@pytest.mark.skip(
+    reason="Flaky test which can cause the worker to crash (so don't xfail). Very open to contributions fixing this"
+)
 def test_open_mfdataset_manyfiles(
     readengine, nfiles, parallel, chunks, file_cache_maxsize
 ):
