
Commit 0783df3

Merge branch 'main' into dask-datetime-to-numeric

* main: (24 commits)
  Fix overflow issue in decode_cf_datetime for dtypes <= np.uint32 (pydata#6598)
  Enable flox in GroupBy and resample (pydata#5734)
  Add setuptools as dependency in ASV benchmark CI (pydata#6609)
  change polyval dim ordering (pydata#6601)
  re-add timedelta support for polyval (pydata#6599)
  Minor Dataset.map docstr clarification (pydata#6595)
  New inline_array kwarg for open_dataset (pydata#6566)
  Fix polyval overloads (pydata#6593)
  Restore old MultiIndex dropping behaviour (pydata#6592)
  [docs] add Dataset.assign_coords example (pydata#6336) (pydata#6558)
  Fix zarr append dtype checks (pydata#6476)
  Add missing space in exception message (pydata#6590)
  Doc Link to accessors list in extending-xarray.rst (pydata#6587)
  Fix Dataset/DataArray.isel with drop=True and scalar DataArray indexes (pydata#6579)
  Add some warnings about rechunking to the docs (pydata#6569)
  [pre-commit.ci] pre-commit autoupdate (pydata#6584)
  terminology.rst: fix link to Unidata's "netcdf_dataset_components" (pydata#6583)
  Allow string formatting of scalar DataArrays (pydata#5981)
  Fix mypy issues & reenable in tests (pydata#6581)
  polyval: Use Horner's algorithm + support chunked inputs (pydata#6548)
  ...
2 parents: 5cff4f1 + 8de7061
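One of the merged commits above ("polyval: Use Horner's algorithm + support chunked inputs", pydata#6548) names Horner's rule as the new evaluation scheme. As an illustrative sketch only (this `horner` helper is not xarray's implementation), the idea is to evaluate a degree-n polynomial with one multiply-add per coefficient instead of computing explicit powers:

```python
import numpy as np

def horner(x: np.ndarray, coeffs: np.ndarray) -> np.ndarray:
    """Evaluate coeffs[0] + coeffs[1]*x + ... with one multiply-add per degree."""
    result = np.full_like(x, coeffs[-1], dtype=float)
    for c in coeffs[-2::-1]:  # walk from the highest remaining degree down to 0
        result = result * x + c
    return result

x = np.linspace(0.0, 1.0, 5)
print(horner(x, np.array([1.0, 2.0, 3.0])))  # evaluates 1 + 2x + 3x**2
```

Keeping a single running array per step is what makes this scheme cheap to apply chunk-by-chunk on dask-backed inputs.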


48 files changed (+1,912 −665 lines)

.github/workflows/ci-additional.yaml (+3 −4)

@@ -72,8 +72,7 @@ jobs:
     runs-on: "ubuntu-latest"
     needs: detect-ci-trigger
     # temporarily skipping due to https://github.com/pydata/xarray/issues/6551
-    # if: needs.detect-ci-trigger.outputs.triggered == 'false'
-    if: false
+    if: needs.detect-ci-trigger.outputs.triggered == 'false'
     defaults:
       run:
         shell: bash -l {0}
@@ -105,10 +104,10 @@ jobs:
       - name: Install mypy
         run: |
           python -m pip install mypy
-          python -m mypy --install-types --non-interactive

       - name: Run mypy
-        run: python -m mypy
+        run: |
+          python -m mypy --install-types --non-interactive

   min-version-policy:
     name: Minimum Version Policy

.pre-commit-config.yaml (+2 −2)

@@ -19,7 +19,7 @@ repos:
     hooks:
       - id: isort
   - repo: https://github.com/asottile/pyupgrade
-    rev: v2.32.0
+    rev: v2.32.1
     hooks:
       - id: pyupgrade
         args:
@@ -46,7 +46,7 @@ repos:
   #   - id: velin
   #     args: ["--write", "--compact"]
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v0.942
+    rev: v0.950
     hooks:
       - id: mypy
         # Copied from setup.cfg

asv_bench/asv.conf.json (+4 −0)

@@ -58,13 +58,17 @@
     // "pip+emcee": [""],  // emcee is only available for install with pip.
     // },
     "matrix": {
+        "setuptools_scm[toml]": [""],  // GH6609
+        "setuptools_scm_git_archive": [""],  // GH6609
         "numpy": [""],
         "pandas": [""],
         "netcdf4": [""],
         "scipy": [""],
         "bottleneck": [""],
         "dask": [""],
         "distributed": [""],
+        "flox": [""],
+        "numpy_groupies": [""],
         "sparse": [""]
     },

asv_bench/benchmarks/groupby.py (+6 −4)

@@ -13,6 +13,7 @@ def setup(self, *args, **kwargs):
             {
                 "a": xr.DataArray(np.r_[np.repeat(1, self.n), np.repeat(2, self.n)]),
                 "b": xr.DataArray(np.arange(2 * self.n)),
+                "c": xr.DataArray(np.arange(2 * self.n)),
             }
         )
         self.ds2d = self.ds1d.expand_dims(z=10)
@@ -50,10 +51,11 @@ class GroupByDask(GroupBy):
     def setup(self, *args, **kwargs):
         requires_dask()
         super().setup(**kwargs)
-        self.ds1d = self.ds1d.sel(dim_0=slice(None, None, 2)).chunk({"dim_0": 50})
-        self.ds2d = self.ds2d.sel(dim_0=slice(None, None, 2)).chunk(
-            {"dim_0": 50, "z": 5}
-        )
+
+        self.ds1d = self.ds1d.sel(dim_0=slice(None, None, 2))
+        self.ds1d["c"] = self.ds1d["c"].chunk({"dim_0": 50})
+        self.ds2d = self.ds2d.sel(dim_0=slice(None, None, 2))
+        self.ds2d["c"] = self.ds2d["c"].chunk({"dim_0": 50, "z": 5})
         self.ds1d_mean = self.ds1d.groupby("b").mean()
         self.ds2d_mean = self.ds2d.groupby("b").mean()
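The rewritten benchmark above exercises the flox-backed groupby path merged in "Enable flox in GroupBy and resample" (pydata#5734). A minimal sketch of toggling that path, assuming flox is installed; the toy dataset below is invented for illustration:

```python
import numpy as np
import xarray as xr

# Toy dataset: ten groups in "a", random values in "b".
ds = xr.Dataset(
    {"a": ("x", np.arange(1000) % 10), "b": ("x", np.random.randn(1000))}
)

# With flox installed, grouped reductions dispatch to it by default;
# set_options makes the choice explicit (use_flox=False falls back).
with xr.set_options(use_flox=True):
    means = ds.groupby("a").mean()
```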

asv_bench/benchmarks/polyfit.py (new file, +38 −0)

@@ -0,0 +1,38 @@
+import numpy as np
+
+import xarray as xr
+
+from . import parameterized, randn, requires_dask
+
+NDEGS = (2, 5, 20)
+NX = (10**2, 10**6)
+
+
+class Polyval:
+    def setup(self, *args, **kwargs):
+        self.xs = {nx: xr.DataArray(randn((nx,)), dims="x", name="x") for nx in NX}
+        self.coeffs = {
+            ndeg: xr.DataArray(
+                randn((ndeg,)), dims="degree", coords={"degree": np.arange(ndeg)}
+            )
+            for ndeg in NDEGS
+        }
+
+    @parameterized(["nx", "ndeg"], [NX, NDEGS])
+    def time_polyval(self, nx, ndeg):
+        x = self.xs[nx]
+        c = self.coeffs[ndeg]
+        xr.polyval(x, c).compute()
+
+    @parameterized(["nx", "ndeg"], [NX, NDEGS])
+    def peakmem_polyval(self, nx, ndeg):
+        x = self.xs[nx]
+        c = self.coeffs[ndeg]
+        xr.polyval(x, c).compute()
+
+
+class PolyvalDask(Polyval):
+    def setup(self, *args, **kwargs):
+        requires_dask()
+        super().setup(*args, **kwargs)
+        self.xs = {k: v.chunk({"x": 10000}) for k, v in self.xs.items()}
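For orientation, a small usage sketch of what this new benchmark times: xr.polyval evaluates coefficients laid out along a ``degree`` dimension against a coordinate, and stays lazy on dask-backed inputs (per pydata#6548). The arrays below are illustrative:

```python
import numpy as np
import xarray as xr

x = xr.DataArray(np.linspace(0.0, 1.0, 10), dims="x")
# Coefficients for 1 + 2x + 3x**2, indexed by the "degree" coordinate.
coeffs = xr.DataArray(
    [1.0, 2.0, 3.0], dims="degree", coords={"degree": [0, 1, 2]}
)

result = xr.polyval(x, coeffs)
# On a chunked input, e.g. x.chunk({"x": 5}), the result stays lazy
# until .compute() is called -- exactly what the benchmark measures.
```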

ci/install-upstream-wheels.sh (+2 −0)

@@ -15,6 +15,7 @@ conda uninstall -y --force \
     pint \
     bottleneck \
     sparse \
+    flox \
     h5netcdf \
     xarray
 # to limit the runtime of Upstream CI
@@ -47,4 +48,5 @@ python -m pip install \
     git+https://github.com/pydata/sparse \
     git+https://github.com/intake/filesystem_spec \
     git+https://github.com/SciTools/nc-time-axis \
+    git+https://github.com/dcherian/flox \
     git+https://github.com/h5netcdf/h5netcdf

ci/requirements/all-but-dask.yml (+1 −0)

@@ -13,6 +13,7 @@ dependencies:
   - cfgrib
   - cftime
   - coveralls
+  - flox
   - h5netcdf
   - h5py
   - hdf5

ci/requirements/environment-windows.yml (+1 −0)

@@ -10,6 +10,7 @@ dependencies:
   - cftime
   - dask-core
   - distributed
+  - flox
   - fsspec!=2021.7.0
   - h5netcdf
   - h5py

ci/requirements/environment.yml (+1 −0)

@@ -12,6 +12,7 @@ dependencies:
   - cftime
   - dask-core
   - distributed
+  - flox
   - fsspec!=2021.7.0
   - h5netcdf
   - h5py

ci/requirements/min-all-deps.yml (+18 −18)

@@ -10,46 +10,46 @@ dependencies:
   - python=3.8
   - boto3=1.13
   - bottleneck=1.3
-  # cartopy 0.18 conflicts with pynio
-  - cartopy=0.17
+  - cartopy=0.19
   - cdms2=3.1
   - cfgrib=0.9
-  - cftime=1.2
+  - cftime=1.4
   - coveralls
-  - dask-core=2.30
-  - distributed=2.30
-  - h5netcdf=0.8
-  - h5py=2.10
-  # hdf5 1.12 conflicts with h5py=2.10
+  - dask-core=2021.04
+  - distributed=2021.04
+  - flox=0.5
+  - h5netcdf=0.11
+  - h5py=3.1
+  # hdf5 1.12 conflicts with h5py=3.1
   - hdf5=1.10
   - hypothesis
   - iris=2.4
   - lxml=4.6  # Optional dep of pydap
-  - matplotlib-base=3.3
+  - matplotlib-base=3.4
   - nc-time-axis=1.2
   # netcdf follows a 1.major.minor[.patch] convention
   # (see https://github.com/Unidata/netcdf4-python/issues/1090)
   # bumping the netCDF4 version is currently blocked by #4491
   - netcdf4=1.5.3
-  - numba=0.51
-  - numpy=1.18
+  - numba=0.53
+  - numpy=1.19
   - packaging=20.0
-  - pandas=1.1
-  - pint=0.16
+  - pandas=1.2
+  - pint=0.17
   - pip
   - pseudonetcdf=3.1
   - pydap=3.2
-  - pynio=1.5
+  # - pynio=1.5.5
   - pytest
   - pytest-cov
   - pytest-env
   - pytest-xdist
-  - rasterio=1.1
-  - scipy=1.5
+  - rasterio=1.2
+  - scipy=1.6
   - seaborn=0.11
-  - sparse=0.11
+  - sparse=0.12
   - toolz=0.11
   - typing_extensions=3.7
-  - zarr=2.5
+  - zarr=2.8
   - pip:
     - numbagg==0.1

doc/getting-started-guide/installing.rst (+2 −2)

@@ -7,9 +7,9 @@ Required dependencies
 ---------------------

 - Python (3.8 or later)
-- `numpy <https://www.numpy.org/>`__ (1.18 or later)
+- `numpy <https://www.numpy.org/>`__ (1.19 or later)
 - `packaging <https://packaging.pypa.io/en/latest/#>`__ (20.0 or later)
-- `pandas <https://pandas.pydata.org/>`__ (1.1 or later)
+- `pandas <https://pandas.pydata.org/>`__ (1.2 or later)

 .. _optional-dependencies:
doc/internals/extending-xarray.rst

+2-2
Original file line numberDiff line numberDiff line change
@@ -92,8 +92,8 @@ on ways to write new accessors and the philosophy behind the approach, see
9292

9393
To help users keep things straight, please `let us know
9494
<https://github.com/pydata/xarray/issues>`_ if you plan to write a new accessor
95-
for an open source library. In the future, we will maintain a list of accessors
96-
and the libraries that implement them on this page.
95+
for an open source library. Existing open source accessors and the libraries
96+
that implement them are available in the list on the :ref:`ecosystem` page.
9797

9898
To make documenting accessors with ``sphinx`` and ``sphinx.ext.autosummary``
9999
easier, you can use `sphinx-autosummary-accessors`_.
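Since this hunk concerns accessor documentation, a minimal sketch of the registration API the page describes may help; the ``geo`` name and ``center`` property are invented for illustration:

```python
import xarray as xr

@xr.register_dataset_accessor("geo")  # accessor name is hypothetical
class GeoAccessor:
    def __init__(self, xarray_obj):
        self._obj = xarray_obj

    @property
    def center(self):
        # Mean (lon, lat), assuming the dataset has these coordinates.
        return (
            float(self._obj.longitude.mean()),
            float(self._obj.latitude.mean()),
        )

# Any dataset with longitude/latitude coordinates then exposes ds.geo.center.
```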

doc/user-guide/dask.rst (+29 −14)

@@ -84,7 +84,7 @@ argument to :py:func:`~xarray.open_dataset` or using the

 In this example ``latitude`` and ``longitude`` do not appear in the ``chunks``
 dict, so only one chunk will be used along those dimensions. It is also
-entirely equivalent to opening a dataset using :py:meth:`~xarray.open_dataset`
+entirely equivalent to opening a dataset using :py:func:`~xarray.open_dataset`
 and then chunking the data using the ``chunk`` method, e.g.,
 ``xr.open_dataset('example-data.nc').chunk({'time': 10})``.

@@ -95,13 +95,21 @@ use :py:func:`~xarray.open_mfdataset`::

 This function will automatically concatenate and merge datasets into one in
 the simple cases that it understands (see :py:func:`~xarray.combine_by_coords`
-for the full disclaimer). By default, :py:meth:`~xarray.open_mfdataset` will chunk each
+for the full disclaimer). By default, :py:func:`~xarray.open_mfdataset` will chunk each
 netCDF file into a single Dask array; again, supply the ``chunks`` argument to
 control the size of the resulting Dask arrays. In more complex cases, you can
-open each file individually using :py:meth:`~xarray.open_dataset` and merge the result, as
-described in :ref:`combining data`. Passing the keyword argument ``parallel=True`` to :py:meth:`~xarray.open_mfdataset` will speed up the reading of large multi-file datasets by
+open each file individually using :py:func:`~xarray.open_dataset` and merge the result, as
+described in :ref:`combining data`. Passing the keyword argument ``parallel=True`` to
+:py:func:`~xarray.open_mfdataset` will speed up the reading of large multi-file datasets by
 executing those read tasks in parallel using ``dask.delayed``.

+.. warning::
+
+    :py:func:`~xarray.open_mfdataset` called without ``chunks`` argument will return
+    dask arrays with chunk sizes equal to the individual files. Re-chunking
+    the dataset after creation with ``ds.chunk()`` will lead to an ineffective use of
+    memory and is not recommended.
+
 You'll notice that printing a dataset still shows a preview of array values,
 even if they are actually Dask arrays. We can do this quickly with Dask because
 we only need to compute the first few values (typically from the first block).
@@ -224,6 +232,7 @@ disk.
 available memory.

 .. note::
+
     For more on the differences between :py:meth:`~xarray.Dataset.persist` and
     :py:meth:`~xarray.Dataset.compute` see this `Stack Overflow answer <https://stackoverflow.com/questions/41806850/dask-difference-between-client-persist-and-client-compute>`_ and the `Dask documentation <https://distributed.dask.org/en/latest/manage-computation.html#dask-collections-to-futures>`_.

@@ -236,6 +245,11 @@ sizes of Dask arrays is done with the :py:meth:`~xarray.Dataset.chunk` method:

     rechunked = ds.chunk({"latitude": 100, "longitude": 100})

+.. warning::
+
+    Rechunking an existing dask array created with :py:func:`~xarray.open_mfdataset`
+    is not recommended (see above).
+
 You can view the size of existing chunks on an array by viewing the
 :py:attr:`~xarray.Dataset.chunks` attribute:

@@ -295,8 +309,7 @@ each block of your xarray object, you have three options:
 ``apply_ufunc``
 ~~~~~~~~~~~~~~~

-Another option is to use xarray's :py:func:`~xarray.apply_ufunc`, which can
-automate `embarrassingly parallel
+:py:func:`~xarray.apply_ufunc` automates `embarrassingly parallel
 <https://en.wikipedia.org/wiki/Embarrassingly_parallel>`__ "map" type operations
 where a function written for processing NumPy arrays should be repeatedly
 applied to xarray objects containing Dask arrays. It works similarly to
@@ -542,18 +555,20 @@ larger chunksizes.
 Optimization Tips
 -----------------

-With analysis pipelines involving both spatial subsetting and temporal resampling, Dask performance can become very slow in certain cases. Here are some optimization tips we have found through experience:
+With analysis pipelines involving both spatial subsetting and temporal resampling, Dask performance
+can become very slow or memory hungry in certain cases. Here are some optimization tips we have found
+through experience:

-1. Do your spatial and temporal indexing (e.g. ``.sel()`` or ``.isel()``) early in the pipeline, especially before calling ``resample()`` or ``groupby()``. Grouping and resampling triggers some computation on all the blocks, which in theory should commute with indexing, but this optimization hasn't been implemented in Dask yet. (See `Dask issue #746 <https://github.com/dask/dask/issues/746>`_).
+1. Do your spatial and temporal indexing (e.g. ``.sel()`` or ``.isel()``) early in the pipeline, especially before calling ``resample()`` or ``groupby()``. Grouping and resampling triggers some computation on all the blocks, which in theory should commute with indexing, but this optimization hasn't been implemented in Dask yet. (See `Dask issue #746 <https://github.com/dask/dask/issues/746>`_). More generally, ``groupby()`` is a costly operation and does not (yet) perform well on datasets split across multiple files (see :pull:`5734` and linked discussions there).

 2. Save intermediate results to disk as a netCDF files (using ``to_netcdf()``) and then load them again with ``open_dataset()`` for further computations. For example, if subtracting temporal mean from a dataset, save the temporal mean to disk before subtracting. Again, in theory, Dask should be able to do the computation in a streaming fashion, but in practice this is a fail case for the Dask scheduler, because it tries to keep every chunk of an array that it computes in memory. (See `Dask issue #874 <https://github.com/dask/dask/issues/874>`_)

-3. Specify smaller chunks across space when using :py:meth:`~xarray.open_mfdataset` (e.g., ``chunks={'latitude': 10, 'longitude': 10}``). This makes spatial subsetting easier, because there's no risk you will load chunks of data referring to different chunks (probably not necessary if you follow suggestion 1).
+3. Specify smaller chunks across space when using :py:meth:`~xarray.open_mfdataset` (e.g., ``chunks={'latitude': 10, 'longitude': 10}``). This makes spatial subsetting easier, because there's no risk you will load subsets of data which span multiple chunks. On individual files, prefer to subset before chunking (suggestion 1).
+
+4. Chunk as early as possible, and avoid rechunking as much as possible. Always pass the ``chunks={}`` argument to :py:func:`~xarray.open_mfdataset` to avoid redundant file reads.

-4. Using the h5netcdf package by passing ``engine='h5netcdf'`` to :py:meth:`~xarray.open_mfdataset`
-   can be quicker than the default ``engine='netcdf4'`` that uses the netCDF4 package.
+5. Using the h5netcdf package by passing ``engine='h5netcdf'`` to :py:meth:`~xarray.open_mfdataset` can be quicker than the default ``engine='netcdf4'`` that uses the netCDF4 package.

-5. Some dask-specific tips may be found `here <https://docs.dask.org/en/latest/array-best-practices.html>`_.
+6. Some dask-specific tips may be found `here <https://docs.dask.org/en/latest/array-best-practices.html>`_.

-6. The dask `diagnostics <https://docs.dask.org/en/latest/understanding-performance.html>`_ can be
-   useful in identifying performance bottlenecks.
+7. The dask `diagnostics <https://docs.dask.org/en/latest/understanding-performance.html>`_ can be useful in identifying performance bottlenecks.
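To make the new rechunking warnings concrete, a hedged sketch of the recommended pattern (file paths and chunk sizes are made up): pick chunks once, at open time, rather than rechunking afterwards.

```python
import xarray as xr

# Preferred: pass chunks at open time so the data is chunked once, up front.
ds = xr.open_mfdataset("data/*.nc", parallel=True, chunks={"time": 10})

# Per tip 4 above, chunks={} also works and avoids redundant file reads
# while keeping each file as a single chunk.
# ds = xr.open_mfdataset("data/*.nc", chunks={})

# Discouraged per the warnings above: open, then rechunk afterwards.
# ds = xr.open_mfdataset("data/*.nc").chunk({"time": 10})
```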

doc/user-guide/terminology.rst (+1 −1)

@@ -27,7 +27,7 @@ complete examples, please consult the relevant documentation.*

     Variable
         A `NetCDF-like variable
-        <https://www.unidata.ucar.edu/software/netcdf/docs/netcdf_data_set_components.html#variables>`_
+        <https://docs.unidata.ucar.edu/nug/current/netcdf_data_set_components.html#variables>`_
         consisting of dimensions, data, and attributes which describe a single
         array. The main functional difference between variables and numpy arrays
         is that numerical operations on variables implement array broadcasting
