diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 7969847c61f..886bcfbd548 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -28,8 +28,11 @@ jobs: environment-name: xarray-tests cache-environment: true cache-environment-key: "${{runner.os}}-${{runner.arch}}-py${{env.PYTHON_VERSION}}-${{env.TODAY}}-${{hashFiles(env.CONDA_ENV_FILE)}}-benchmark" + # add "build" because of https://github.com/airspeed-velocity/asv/issues/1385 create-args: >- asv + build + mamba - name: Run benchmarks @@ -47,9 +50,6 @@ jobs: asv machine --yes echo "Baseline: ${{ github.event.pull_request.base.sha }} (${{ github.event.pull_request.base.label }})" echo "Contender: ${GITHUB_SHA} (${{ github.event.pull_request.head.label }})" - # Use mamba for env creation - # export CONDA_EXE=$(which mamba) - export CONDA_EXE=$(which conda) # Run benchmarks for current commit against base ASV_OPTIONS="--split --show-stderr --factor $ASV_FACTOR" asv continuous $ASV_OPTIONS ${{ github.event.pull_request.base.sha }} ${GITHUB_SHA} \ diff --git a/.github/workflows/ci-additional.yaml b/.github/workflows/ci-additional.yaml index e48fdbcabd7..48adfdd9096 100644 --- a/.github/workflows/ci-additional.yaml +++ b/.github/workflows/ci-additional.yaml @@ -8,6 +8,13 @@ on: branches: - "main" - "backend-indexing" + paths: + - 'ci/**' + - '.github/**' + - '/*' # covers files such as `pyproject.toml` + - 'properties/**' + - 'xarray/**' + - "backend-indexing" workflow_dispatch: # allows you to trigger manually concurrency: @@ -129,7 +136,7 @@ jobs: python -m mypy --install-types --non-interactive --cobertura-xml-report mypy_report xarray/ - name: Upload mypy coverage to Codecov - uses: codecov/codecov-action@v4.3.0 + uses: codecov/codecov-action@v4.3.1 with: file: mypy_report/cobertura.xml flags: mypy @@ -183,7 +190,7 @@ jobs: python -m mypy --install-types --non-interactive --cobertura-xml-report mypy_report xarray/ - name: Upload mypy coverage to Codecov - uses: codecov/codecov-action@v4.3.0 + uses: codecov/codecov-action@v4.3.1 with: file: mypy_report/cobertura.xml flags: mypy39 @@ -244,7 +251,7 @@ jobs: python -m pyright xarray/ - name: Upload pyright coverage to Codecov - uses: codecov/codecov-action@v4.3.0 + uses: codecov/codecov-action@v4.3.1 with: file: pyright_report/cobertura.xml flags: pyright @@ -303,7 +310,7 @@ jobs: python -m pyright xarray/ - name: Upload pyright coverage to Codecov - uses: codecov/codecov-action@v4.3.0 + uses: codecov/codecov-action@v4.3.1 with: file: pyright_report/cobertura.xml flags: pyright39 diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index c516ccf7ce6..4263c313cbc 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -8,6 +8,12 @@ on: branches: - "main" - "backend-indexing" + paths: + - 'ci/**' + - '.github/**' + - '/*' # covers files such as `pyproject.toml` + - 'properties/**' + - 'xarray/**' workflow_dispatch: # allows you to trigger manually concurrency: @@ -158,7 +164,7 @@ jobs: path: pytest.xml - name: Upload code coverage to Codecov - uses: codecov/codecov-action@v4.3.0 + uses: codecov/codecov-action@v4.3.1 with: file: ./coverage.xml flags: unittests diff --git a/.github/workflows/upstream-dev-ci.yaml b/.github/workflows/upstream-dev-ci.yaml index 4f3d199dd2d..a216dfd7428 100644 --- a/.github/workflows/upstream-dev-ci.yaml +++ b/.github/workflows/upstream-dev-ci.yaml @@ -143,7 +143,7 @@ jobs: run: | python -m mypy --install-types --non-interactive --cobertura-xml-report mypy_report - name: Upload mypy coverage to Codecov - uses: codecov/codecov-action@v4.3.0 + uses: codecov/codecov-action@v4.3.1 with: file: mypy_report/cobertura.xml flags: mypy diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index a709d0a51a7..9dc86df712d 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -29,7 +29,7 @@ // If missing or the empty string, the tool will be automatically // determined by looking for tools on the PATH environment // variable. - "environment_type": "conda", + "environment_type": "mamba", "conda_channels": ["conda-forge"], // timeout in seconds for installing any dependencies in environment @@ -41,7 +41,7 @@ // The Pythons you'd like to test against. If not provided, defaults // to the current version of Python used to run `asv`. - "pythons": ["3.10"], + "pythons": ["3.11"], // The matrix of dependencies to test. Each key is the name of a // package (in PyPI) and the values are version numbers. An empty @@ -72,8 +72,12 @@ "sparse": [""], "cftime": [""] }, - - + // fix for bad builds + // https://github.com/airspeed-velocity/asv/issues/1389#issuecomment-2076131185 + "build_command": [ + "python -m build", + "python -mpip wheel --no-deps --no-build-isolation --no-index -w {build_cache_dir} {build_dir}" + ], // Combinations of libraries/python versions can be excluded/included // from the set to test. Each entry is a dictionary containing additional // key-value pairs to include/exclude. diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 1b3e55fa659..065c1b3b17f 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -68,6 +68,7 @@ def setup(self, *args, **kwargs): self.ds2d_mean = self.ds2d.groupby("b").mean().compute() +# TODO: These don't work now because we are calling `.compute` explicitly. class GroupByPandasDataFrame(GroupBy): """Run groupby tests using pandas DataFrame.""" @@ -111,11 +112,11 @@ def setup(self, *args, **kwargs): { "b": ("time", np.arange(365.0 * 24)), }, - coords={"time": pd.date_range("2001-01-01", freq="H", periods=365 * 24)}, + coords={"time": pd.date_range("2001-01-01", freq="h", periods=365 * 24)}, ) self.ds2d = self.ds1d.expand_dims(z=10) - self.ds1d_mean = self.ds1d.resample(time="48H").mean() - self.ds2d_mean = self.ds2d.resample(time="48H").mean() + self.ds1d_mean = self.ds1d.resample(time="48h").mean() + self.ds2d_mean = self.ds2d.resample(time="48h").mean() @parameterized(["ndim"], [(1, 2)]) def time_init(self, ndim): @@ -127,7 +128,7 @@ def time_init(self, ndim): def time_agg_small_num_groups(self, method, ndim, use_flox): ds = getattr(self, f"ds{ndim}d") with xr.set_options(use_flox=use_flox): - getattr(ds.resample(time="3M"), method)().compute() + getattr(ds.resample(time="3ME"), method)().compute() @parameterized( ["method", "ndim", "use_flox"], [("sum", "mean"), (1, 2), (True, False)] @@ -135,7 +136,7 @@ def time_agg_small_num_groups(self, method, ndim, use_flox): def time_agg_large_num_groups(self, method, ndim, use_flox): ds = getattr(self, f"ds{ndim}d") with xr.set_options(use_flox=use_flox): - getattr(ds.resample(time="48H"), method)().compute() + getattr(ds.resample(time="48h"), method)().compute() class ResampleDask(Resample): @@ -154,13 +155,13 @@ def setup(self, *args, **kwargs): }, coords={ "time": xr.date_range( - "2001-01-01", freq="H", periods=365 * 24, calendar="noleap" + "2001-01-01", freq="h", periods=365 * 24, calendar="noleap" ) }, ) self.ds2d = self.ds1d.expand_dims(z=10) - self.ds1d_mean = self.ds1d.resample(time="48H").mean() - self.ds2d_mean = self.ds2d.resample(time="48H").mean() + self.ds1d_mean = self.ds1d.resample(time="48h").mean() + self.ds2d_mean = self.ds2d.resample(time="48h").mean() @parameterized(["use_cftime", "use_flox"], [[True, False], [True, False]]) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 169c8af06e9..529d023daa8 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -12,12 +12,14 @@ nt = 500 basic_indexes = { + "1scalar": {"x": 0}, "1slice": {"x": slice(0, 3)}, "1slice-1scalar": {"x": 0, "y": slice(None, None, 3)}, "2slicess-1scalar": {"x": slice(3, -3, 3), "y": 1, "t": slice(None, -3, 3)}, } basic_assignment_values = { + "1scalar": 0, "1slice": xr.DataArray(randn((3, ny), frac_nan=0.1), dims=["x", "y"]), "1slice-1scalar": xr.DataArray(randn(int(ny / 3) + 1, frac_nan=0.1), dims=["y"]), "2slicess-1scalar": xr.DataArray( @@ -74,6 +76,10 @@ def setup(self, key): "x_coords": ("x", np.linspace(1.1, 2.1, nx)), }, ) + # Benchmark how indexing is slowed down by adding many scalar variable + # to the dataset + # https://github.com/pydata/xarray/pull/9003 + self.ds_large = self.ds.merge({f"extra_var{i}": i for i in range(400)}) class Indexing(Base): @@ -89,6 +95,11 @@ def time_indexing_outer(self, key): def time_indexing_vectorized(self, key): self.ds.isel(**vectorized_indexes[key]).load() + @parameterized(["key"], [list(basic_indexes.keys())]) + def time_indexing_basic_ds_large(self, key): + # https://github.com/pydata/xarray/pull/9003 + self.ds_large.isel(**basic_indexes[key]).load() + class Assignment(Base): @parameterized(["key"], [list(basic_indexes.keys())]) diff --git a/ci/min_deps_check.py b/ci/min_deps_check.py index 48ea323ed81..5ec7bff0a30 100755 --- a/ci/min_deps_check.py +++ b/ci/min_deps_check.py @@ -133,7 +133,7 @@ def process_pkg( - publication date of version suggested by policy (YYYY-MM-DD) - status ("<", "=", "> (!)") """ - print("Analyzing %s..." % pkg) + print(f"Analyzing {pkg}...") versions = query_conda(pkg) try: diff --git a/doc/user-guide/io.rst b/doc/user-guide/io.rst index 63bf8b80d81..b73d0fdcb51 100644 --- a/doc/user-guide/io.rst +++ b/doc/user-guide/io.rst @@ -874,7 +874,7 @@ and then calling ``to_zarr`` with ``compute=False`` to write only metadata # The values of this dask array are entirely irrelevant; only the dtype, # shape and chunks are used dummies = dask.array.zeros(30, chunks=10) - ds = xr.Dataset({"foo": ("x", dummies)}) + ds = xr.Dataset({"foo": ("x", dummies)}, coords={"x": np.arange(30)}) path = "path/to/directory.zarr" # Now we write the metadata without computing any array values ds.to_zarr(path, compute=False) @@ -890,7 +890,7 @@ where the data should be written (in index space, not label space), e.g., # For convenience, we'll slice a single dataset, but in the real use-case # we would create them separately possibly even from separate processes. - ds = xr.Dataset({"foo": ("x", np.arange(30))}) + ds = xr.Dataset({"foo": ("x", np.arange(30))}, coords={"x": np.arange(30)}) # Any of the following region specifications are valid ds.isel(x=slice(0, 10)).to_zarr(path, region="auto") ds.isel(x=slice(10, 20)).to_zarr(path, region={"x": "auto"}) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 0f79b648187..378e6330352 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -29,8 +29,13 @@ New Features for example, will retain the object. However, one cannot do operations that are not possible on the `ExtensionArray` then, such as broadcasting. By `Ilan Gold `_. +- :py:func:`testing.assert_allclose`/:py:func:`testing.assert_equal` now accept a new argument `check_dims="transpose"`, controlling whether a transposed array is considered equal. (:issue:`5733`, :pull:`8991`) + By `Ignacio Martinez Vazquez `_. - Added the option to avoid automatically creating 1D pandas indexes in :py:meth:`Dataset.expand_dims()`, by passing the new kwarg - `create_index=False`. (:pull:`8960`) + `create_index_for_new_dim=False`. (:pull:`8960`) + By `Tom Nicholas `_. +- Avoid automatically re-creating 1D pandas indexes in :py:func:`concat()`. Also added option to avoid creating 1D indexes for + new dimension coordinates by passing the new kwarg `create_index_for_new_dim=False`. (:issue:`8871`, :pull:`8872`) By `Tom Nicholas `_. Breaking changes @@ -58,6 +63,12 @@ Breaking changes Bug fixes ~~~~~~~~~ +- Following `an upstream bug fix + `_ to + :py:func:`pandas.date_range`, date ranges produced by + :py:func:`xarray.cftime_range` with negative frequencies will now fall fully + within the bounds of the provided start and end dates (:pull:`8999`). By + `Spencer Clark `_. Internal Changes diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 62085fe5e2a..c9a8630a575 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -27,7 +27,6 @@ _normalize_path, ) from xarray.backends.locks import _get_scheduler -from xarray.backends.zarr import open_zarr from xarray.core import indexing from xarray.core.combine import ( _infer_concat_order_from_positions, @@ -1522,92 +1521,6 @@ def save_mfdataset( ) -def _auto_detect_region(ds_new, ds_orig, dim): - # Create a mapping array of coordinates to indices on the original array - coord = ds_orig[dim] - da_map = DataArray(np.arange(coord.size), coords={dim: coord}) - - try: - da_idxs = da_map.sel({dim: ds_new[dim]}) - except KeyError as e: - if "not all values found" in str(e): - raise KeyError( - f"Not all values of coordinate '{dim}' in the new array were" - " found in the original store. Writing to a zarr region slice" - " requires that no dimensions or metadata are changed by the write." - ) - else: - raise e - - if (da_idxs.diff(dim) != 1).any(): - raise ValueError( - f"The auto-detected region of coordinate '{dim}' for writing new data" - " to the original store had non-contiguous indices. Writing to a zarr" - " region slice requires that the new data constitute a contiguous subset" - " of the original store." - ) - - dim_slice = slice(da_idxs.values[0], da_idxs.values[-1] + 1) - - return dim_slice - - -def _auto_detect_regions(ds, region, open_kwargs): - ds_original = open_zarr(**open_kwargs) - for key, val in region.items(): - if val == "auto": - region[key] = _auto_detect_region(ds, ds_original, key) - return region - - -def _validate_and_autodetect_region(ds, region, mode, open_kwargs) -> dict[str, slice]: - if region == "auto": - region = {dim: "auto" for dim in ds.dims} - - if not isinstance(region, dict): - raise TypeError(f"``region`` must be a dict, got {type(region)}") - - if any(v == "auto" for v in region.values()): - if mode != "r+": - raise ValueError( - f"``mode`` must be 'r+' when using ``region='auto'``, got {mode}" - ) - region = _auto_detect_regions(ds, region, open_kwargs) - - for k, v in region.items(): - if k not in ds.dims: - raise ValueError( - f"all keys in ``region`` are not in Dataset dimensions, got " - f"{list(region)} and {list(ds.dims)}" - ) - if not isinstance(v, slice): - raise TypeError( - "all values in ``region`` must be slice objects, got " - f"region={region}" - ) - if v.step not in {1, None}: - raise ValueError( - "step on all slices in ``region`` must be 1 or None, got " - f"region={region}" - ) - - non_matching_vars = [ - k for k, v in ds.variables.items() if not set(region).intersection(v.dims) - ] - if non_matching_vars: - raise ValueError( - f"when setting `region` explicitly in to_zarr(), all " - f"variables in the dataset to write must have at least " - f"one dimension in common with the region's dimensions " - f"{list(region.keys())}, but that is not " - f"the case for some variables here. To drop these variables " - f"from this dataset before exporting to zarr, write: " - f".drop_vars({non_matching_vars!r})" - ) - - return region - - def _validate_datatypes_for_zarr_append(zstore, dataset): """If variable exists in the store, confirm dtype of the data to append is compatible with existing dtype. @@ -1768,24 +1681,6 @@ def to_zarr( # validate Dataset keys, DataArray names _validate_dataset_names(dataset) - if region is not None: - open_kwargs = dict( - store=store, - synchronizer=synchronizer, - group=group, - consolidated=consolidated, - storage_options=storage_options, - zarr_version=zarr_version, - ) - region = _validate_and_autodetect_region(dataset, region, mode, open_kwargs) - # can't modify indexed with region writes - dataset = dataset.drop_vars(dataset.indexes) - if append_dim is not None and append_dim in region: - raise ValueError( - f"cannot list the same dimension in both ``append_dim`` and " - f"``region`` with to_zarr(), got {append_dim} in both" - ) - if zarr_version is None: # default to 2 if store doesn't specify it's version (e.g. a path) zarr_version = int(getattr(store, "_store_version", 2)) @@ -1815,6 +1710,16 @@ def to_zarr( write_empty=write_empty_chunks, ) + if region is not None: + zstore._validate_and_autodetect_region(dataset) + # can't modify indexed with region writes + dataset = dataset.drop_vars(dataset.indexes) + if append_dim is not None and append_dim in region: + raise ValueError( + f"cannot list the same dimension in both ``append_dim`` and " + f"``region`` with to_zarr(), got {append_dim} in both" + ) + if mode in ["a", "a-", "r+"]: _validate_datatypes_for_zarr_append(zstore, dataset) if append_dim is not None: diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 69f8b56c083..4c2e8be0c16 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any import numpy as np +import pandas as pd from xarray import coding, conventions from xarray.backends.common import ( @@ -522,7 +523,9 @@ def ds(self): # TODO: consider deprecating this in favor of zarr_group return self.zarr_group - def open_store_variable(self, name, zarr_array): + def open_store_variable(self, name, zarr_array=None): + if zarr_array is None: + zarr_array = self.zarr_group[name] data = indexing.LazilyIndexedArray(ZarrArrayWrapper(zarr_array)) try_nczarr = self._mode == "r" dimensions, attributes = _get_zarr_dims_and_attrs( @@ -636,11 +639,7 @@ def store( # avoid needing to load index variables into memory. # TODO: consider making loading indexes lazy again? existing_vars, _, _ = conventions.decode_cf_variables( - { - k: v - for k, v in self.get_variables().items() - if k in existing_variable_names - }, + {k: self.open_store_variable(name=k) for k in existing_variable_names}, self.get_attrs(), ) # Modified variables must use the same encoding as the store. @@ -809,10 +808,93 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No region = tuple(write_region[dim] for dim in dims) writer.add(v.data, zarr_array, region) - def close(self): + def close(self) -> None: if self._close_store_on_close: self.zarr_group.store.close() + def _auto_detect_regions(self, ds, region): + for dim, val in region.items(): + if val != "auto": + continue + + if dim not in ds._variables: + # unindexed dimension + region[dim] = slice(0, ds.sizes[dim]) + continue + + variable = conventions.decode_cf_variable( + dim, self.open_store_variable(dim).compute() + ) + assert variable.dims == (dim,) + index = pd.Index(variable.data) + idxs = index.get_indexer(ds[dim].data) + if any(idxs == -1): + raise KeyError( + f"Not all values of coordinate '{dim}' in the new array were" + " found in the original store. Writing to a zarr region slice" + " requires that no dimensions or metadata are changed by the write." + ) + + if (np.diff(idxs) != 1).any(): + raise ValueError( + f"The auto-detected region of coordinate '{dim}' for writing new data" + " to the original store had non-contiguous indices. Writing to a zarr" + " region slice requires that the new data constitute a contiguous subset" + " of the original store." + ) + region[dim] = slice(idxs[0], idxs[-1] + 1) + return region + + def _validate_and_autodetect_region(self, ds) -> None: + region = self._write_region + + if region == "auto": + region = {dim: "auto" for dim in ds.dims} + + if not isinstance(region, dict): + raise TypeError(f"``region`` must be a dict, got {type(region)}") + if any(v == "auto" for v in region.values()): + if self._mode != "r+": + raise ValueError( + f"``mode`` must be 'r+' when using ``region='auto'``, got {self._mode!r}" + ) + region = self._auto_detect_regions(ds, region) + + # validate before attempting to auto-detect since the auto-detection + # should always return a valid slice. + for k, v in region.items(): + if k not in ds.dims: + raise ValueError( + f"all keys in ``region`` are not in Dataset dimensions, got " + f"{list(region)} and {list(ds.dims)}" + ) + if not isinstance(v, slice): + raise TypeError( + "all values in ``region`` must be slice objects, got " + f"region={region}" + ) + if v.step not in {1, None}: + raise ValueError( + "step on all slices in ``region`` must be 1 or None, got " + f"region={region}" + ) + + non_matching_vars = [ + k for k, v in ds.variables.items() if not set(region).intersection(v.dims) + ] + if non_matching_vars: + raise ValueError( + f"when setting `region` explicitly in to_zarr(), all " + f"variables in the dataset to write must have at least " + f"one dimension in common with the region's dimensions " + f"{list(region.keys())}, but that is not " + f"the case for some variables here. To drop these variables " + f"from this dataset before exporting to zarr, write: " + f".drop_vars({non_matching_vars!r})" + ) + + self._write_region = region + def open_zarr( store, diff --git a/xarray/coding/cftime_offsets.py b/xarray/coding/cftime_offsets.py index 2e594455874..0af75f404a2 100644 --- a/xarray/coding/cftime_offsets.py +++ b/xarray/coding/cftime_offsets.py @@ -845,7 +845,12 @@ def _generate_range(start, end, periods, offset): A generator object """ if start: - start = offset.rollforward(start) + # From pandas GH 56147 / 56832 to account for negative direction and + # range bounds + if offset.n >= 0: + start = offset.rollforward(start) + else: + start = offset.rollback(start) if periods is None and end < start and offset.n >= 0: end = None diff --git a/xarray/core/concat.py b/xarray/core/concat.py index d95cbccd36a..b1cca586992 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -8,6 +8,7 @@ from xarray.core import dtypes, utils from xarray.core.alignment import align, reindex_variables +from xarray.core.coordinates import Coordinates from xarray.core.duck_array_ops import lazy_array_equiv from xarray.core.indexes import Index, PandasIndex from xarray.core.merge import ( @@ -42,6 +43,7 @@ def concat( fill_value: object = dtypes.NA, join: JoinOptions = "outer", combine_attrs: CombineAttrsOptions = "override", + create_index_for_new_dim: bool = True, ) -> T_Dataset: ... @@ -56,6 +58,7 @@ def concat( fill_value: object = dtypes.NA, join: JoinOptions = "outer", combine_attrs: CombineAttrsOptions = "override", + create_index_for_new_dim: bool = True, ) -> T_DataArray: ... @@ -69,6 +72,7 @@ def concat( fill_value=dtypes.NA, join: JoinOptions = "outer", combine_attrs: CombineAttrsOptions = "override", + create_index_for_new_dim: bool = True, ): """Concatenate xarray objects along a new or existing dimension. @@ -162,6 +166,8 @@ def concat( If a callable, it must expect a sequence of ``attrs`` dicts and a context object as its only parameters. + create_index_for_new_dim : bool, default: True + Whether to create a new ``PandasIndex`` object when the objects being concatenated contain scalar variables named ``dim``. Returns ------- @@ -217,6 +223,25 @@ def concat( x (new_dim) >> ds = xr.Dataset(coords={"x": 0}) + >>> xr.concat([ds, ds], dim="x") + Size: 16B + Dimensions: (x: 2) + Coordinates: + * x (x) int64 16B 0 0 + Data variables: + *empty* + + >>> xr.concat([ds, ds], dim="x").indexes + Indexes: + x Index([0, 0], dtype='int64', name='x') + + >>> xr.concat([ds, ds], dim="x", create_index_for_new_dim=False).indexes + Indexes: + *empty* """ # TODO: add ignore_index arguments copied from pandas.concat # TODO: support concatenating scalar coordinates even if the concatenated @@ -245,6 +270,7 @@ def concat( fill_value=fill_value, join=join, combine_attrs=combine_attrs, + create_index_for_new_dim=create_index_for_new_dim, ) elif isinstance(first_obj, Dataset): return _dataset_concat( @@ -257,6 +283,7 @@ def concat( fill_value=fill_value, join=join, combine_attrs=combine_attrs, + create_index_for_new_dim=create_index_for_new_dim, ) else: raise TypeError( @@ -439,7 +466,7 @@ def _parse_datasets( if dim in dims: continue - if dim not in dim_coords: + if dim in ds.coords and dim not in dim_coords: dim_coords[dim] = ds.coords[dim].variable dims = dims | set(ds.dims) @@ -456,6 +483,7 @@ def _dataset_concat( fill_value: Any = dtypes.NA, join: JoinOptions = "outer", combine_attrs: CombineAttrsOptions = "override", + create_index_for_new_dim: bool = True, ) -> T_Dataset: """ Concatenate a sequence of datasets along a new or existing dimension @@ -489,7 +517,6 @@ def _dataset_concat( datasets ) dim_names = set(dim_coords) - unlabeled_dims = dim_names - coord_names both_data_and_coords = coord_names & data_names if both_data_and_coords: @@ -502,7 +529,10 @@ def _dataset_concat( # case where concat dimension is a coordinate or data_var but not a dimension if (dim in coord_names or dim in data_names) and dim not in dim_names: - datasets = [ds.expand_dims(dim) for ds in datasets] + datasets = [ + ds.expand_dims(dim, create_index_for_new_dim=create_index_for_new_dim) + for ds in datasets + ] # determine which variables to concatenate concat_over, equals, concat_dim_lengths = _calc_concat_over( @@ -510,7 +540,7 @@ def _dataset_concat( ) # determine which variables to merge, and then merge them according to compat - variables_to_merge = (coord_names | data_names) - concat_over - unlabeled_dims + variables_to_merge = (coord_names | data_names) - concat_over result_vars = {} result_indexes = {} @@ -567,7 +597,8 @@ def get_indexes(name): var = ds._variables[name] if not var.dims: data = var.set_dims(dim).values - yield PandasIndex(data, dim, coord_dtype=var.dtype) + if create_index_for_new_dim: + yield PandasIndex(data, dim, coord_dtype=var.dtype) # create concatenation index, needed for later reindexing file_start_indexes = np.append(0, np.cumsum(concat_dim_lengths)) @@ -646,29 +677,33 @@ def get_indexes(name): # preserves original variable order result_vars[name] = result_vars.pop(name) - result = type(datasets[0])(result_vars, attrs=result_attrs) - - absent_coord_names = coord_names - set(result.variables) + absent_coord_names = coord_names - set(result_vars) if absent_coord_names: raise ValueError( f"Variables {absent_coord_names!r} are coordinates in some datasets but not others." ) - result = result.set_coords(coord_names) - result.encoding = result_encoding - result = result.drop_vars(unlabeled_dims, errors="ignore") + result_data_vars = {} + coord_vars = {} + for name, result_var in result_vars.items(): + if name in coord_names: + coord_vars[name] = result_var + else: + result_data_vars[name] = result_var if index is not None: - # add concat index / coordinate last to ensure that its in the final Dataset if dim_var is not None: index_vars = index.create_variables({dim: dim_var}) else: index_vars = index.create_variables() - result[dim] = index_vars[dim] + + coord_vars[dim] = index_vars[dim] result_indexes[dim] = index - # TODO: add indexes at Dataset creation (when it is supported) - result = result._overwrite_indexes(result_indexes) + coords_obj = Coordinates(coord_vars, indexes=result_indexes) + + result = type(datasets[0])(result_data_vars, coords=coords_obj, attrs=result_attrs) + result.encoding = result_encoding return result @@ -683,6 +718,7 @@ def _dataarray_concat( fill_value: object = dtypes.NA, join: JoinOptions = "outer", combine_attrs: CombineAttrsOptions = "override", + create_index_for_new_dim: bool = True, ) -> T_DataArray: from xarray.core.dataarray import DataArray @@ -719,6 +755,7 @@ def _dataarray_concat( fill_value=fill_value, join=join, combine_attrs=combine_attrs, + create_index_for_new_dim=create_index_for_new_dim, ) merged_attrs = merge_attrs([da.attrs for da in arrays], combine_attrs) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index c89dedf1215..4dc897c1878 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -2558,7 +2558,7 @@ def expand_dims( self, dim: None | Hashable | Sequence[Hashable] | Mapping[Any, Any] = None, axis: None | int | Sequence[int] = None, - create_index: bool = True, + create_index_for_new_dim: bool = True, **dim_kwargs: Any, ) -> Self: """Return a new object with an additional axis (or axes) inserted at @@ -2569,7 +2569,7 @@ def expand_dims( coordinate consisting of a single value. The automatic creation of indexes to back new 1D coordinate variables - controlled by the create_index kwarg. + controlled by the create_index_for_new_dim kwarg. Parameters ---------- @@ -2586,8 +2586,8 @@ def expand_dims( multiple axes are inserted. In this case, dim arguments should be same length list. If axis=None is passed, all the axes will be inserted to the start of the result array. - create_index : bool, default is True - Whether to create new PandasIndex objects for any new 1D coordinate variables. + create_index_for_new_dim : bool, default: True + Whether to create new ``PandasIndex`` objects when the object being expanded contains scalar variables with names in ``dim``. **dim_kwargs : int or sequence or ndarray The keywords are arbitrary dimensions being inserted and the values are either the lengths of the new dims (if int is given), or their @@ -2651,7 +2651,9 @@ def expand_dims( dim = {dim: 1} dim = either_dict_or_kwargs(dim, dim_kwargs, "expand_dims") - ds = self._to_temp_dataset().expand_dims(dim, axis, create_index=create_index) + ds = self._to_temp_dataset().expand_dims( + dim, axis, create_index_for_new_dim=create_index_for_new_dim + ) return self._from_temp_dataset(ds) def set_index( diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 2ddcacd2fa0..09597670573 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4513,7 +4513,7 @@ def expand_dims( self, dim: None | Hashable | Sequence[Hashable] | Mapping[Any, Any] = None, axis: None | int | Sequence[int] = None, - create_index: bool = True, + create_index_for_new_dim: bool = True, **dim_kwargs: Any, ) -> Self: """Return a new object with an additional axis (or axes) inserted at @@ -4524,7 +4524,7 @@ def expand_dims( coordinate consisting of a single value. The automatic creation of indexes to back new 1D coordinate variables - controlled by the create_index kwarg. + controlled by the create_index_for_new_dim kwarg. Parameters ---------- @@ -4541,8 +4541,8 @@ def expand_dims( multiple axes are inserted. In this case, dim arguments should be same length list. If axis=None is passed, all the axes will be inserted to the start of the result array. - create_index : bool, default is True - Whether to create new PandasIndex objects for any new 1D coordinate variables. + create_index_for_new_dim : bool, default: True + Whether to create new ``PandasIndex`` objects when the object being expanded contains scalar variables with names in ``dim``. **dim_kwargs : int or sequence or ndarray The keywords are arbitrary dimensions being inserted and the values are either the lengths of the new dims (if int is given), or their @@ -4612,6 +4612,33 @@ def expand_dims( Data variables: temperature (y, x, time) float64 96B 0.5488 0.7152 0.6028 ... 0.7917 0.5289 + # Expand a scalar variable along a new dimension of the same name with and without creating a new index + + >>> ds = xr.Dataset(coords={"x": 0}) + >>> ds + Size: 8B + Dimensions: () + Coordinates: + x int64 8B 0 + Data variables: + *empty* + + >>> ds.expand_dims("x") + Size: 8B + Dimensions: (x: 1) + Coordinates: + * x (x) int64 8B 0 + Data variables: + *empty* + + >>> ds.expand_dims("x").indexes + Indexes: + x Index([0], dtype='int64', name='x') + + >>> ds.expand_dims("x", create_index_for_new_dim=False).indexes + Indexes: + *empty* + See Also -------- DataArray.expand_dims @@ -4663,7 +4690,7 @@ def expand_dims( # value within the dim dict to the length of the iterable # for later use. - if create_index: + if create_index_for_new_dim: index = PandasIndex(v, k) indexes[k] = index name_and_new_1d_var = index.create_variables() @@ -4705,14 +4732,14 @@ def expand_dims( variables[k] = v.set_dims(dict(all_dims)) else: if k not in variables: - if k in coord_names and create_index: + if k in coord_names and create_index_for_new_dim: # If dims includes a label of a non-dimension coordinate, # it will be promoted to a 1D coordinate with a single value. index, index_vars = create_default_index_implicit(v.set_dims(k)) indexes[k] = index variables.update(index_vars) else: - if create_index: + if create_index_for_new_dim: warnings.warn( f"No index created for dimension {k} because variable {k} is not a coordinate. " f"To create an index for {k}, please first call `.set_coords('{k}')` on this object.", @@ -5400,7 +5427,7 @@ def to_stacked_array( [3, 4, 5, 7]]) Coordinates: * z (z) object 32B MultiIndex - * variable (z) object 32B 'a' 'a' 'a' 'b' + * variable (z) 0: - # can't use fastpath (yet) for scalars - return cast("T_DuckArray", _maybe_wrap_data(data)) + if fastpath and getattr(data, "ndim", None) is not None: + return cast("T_DuckArray", data) from xarray.core.dataarray import DataArray diff --git a/xarray/datatree_/datatree/__init__.py b/xarray/datatree_/datatree/__init__.py index 3159d612913..51c5f1b3073 100644 --- a/xarray/datatree_/datatree/__init__.py +++ b/xarray/datatree_/datatree/__init__.py @@ -1,7 +1,6 @@ # import public API from xarray.core.treenode import InvalidTreeError, NotFoundInTreeError - __all__ = ( "InvalidTreeError", "NotFoundInTreeError", diff --git a/xarray/datatree_/datatree/common.py b/xarray/datatree_/datatree/common.py index e4d52925ede..f4f74337c50 100644 --- a/xarray/datatree_/datatree/common.py +++ b/xarray/datatree_/datatree/common.py @@ -6,8 +6,9 @@ """ import warnings +from collections.abc import Hashable, Iterable, Mapping from contextlib import suppress -from typing import Any, Hashable, Iterable, List, Mapping +from typing import Any class TreeAttrAccessMixin: @@ -83,16 +84,14 @@ def __setattr__(self, name: str, value: Any) -> None: except AttributeError as e: # Don't accidentally shadow custom AttributeErrors, e.g. # DataArray.dims.setter - if str(e) != "{!r} object has no attribute {!r}".format( - type(self).__name__, name - ): + if str(e) != f"{type(self).__name__!r} object has no attribute {name!r}": raise raise AttributeError( f"cannot set attribute {name!r} on a {type(self).__name__!r} object. Use __setitem__ style" "assignment (e.g., `ds['name'] = ...`) instead of assigning variables." ) from e - def __dir__(self) -> List[str]: + def __dir__(self) -> list[str]: """Provide method name lookup and completion. Only provide 'public' methods. """ diff --git a/xarray/testing/assertions.py b/xarray/testing/assertions.py index 018874c169e..69885868f83 100644 --- a/xarray/testing/assertions.py +++ b/xarray/testing/assertions.py @@ -95,6 +95,18 @@ def assert_isomorphic(a: DataTree, b: DataTree, from_root: bool = False): raise TypeError(f"{type(a)} not of type DataTree") +def maybe_transpose_dims(a, b, check_dim_order: bool): + """Helper for assert_equal/allclose/identical""" + __tracebackhide__ = True + if not isinstance(a, (Variable, DataArray, Dataset)): + return b + if not check_dim_order and set(a.dims) == set(b.dims): + # Ensure transpose won't fail if a dimension is missing + # If this is the case, the difference will be caught by the caller + return b.transpose(*a.dims) + return b + + @overload def assert_equal(a, b): ... @@ -104,7 +116,7 @@ def assert_equal(a: DataTree, b: DataTree, from_root: bool = True): ... @ensure_warnings -def assert_equal(a, b, from_root=True): +def assert_equal(a, b, from_root=True, check_dim_order: bool = True): """Like :py:func:`numpy.testing.assert_array_equal`, but for xarray objects. @@ -127,6 +139,8 @@ def assert_equal(a, b, from_root=True): Only used when comparing DataTree objects. Indicates whether or not to first traverse to the root of the trees before checking for isomorphism. If a & b have no parents then this has no effect. + check_dim_order : bool, optional, default is True + Whether dimensions must be in the same order. See Also -------- @@ -137,6 +151,7 @@ def assert_equal(a, b, from_root=True): assert ( type(a) == type(b) or isinstance(a, Coordinates) and isinstance(b, Coordinates) ) + b = maybe_transpose_dims(a, b, check_dim_order) if isinstance(a, (Variable, DataArray)): assert a.equals(b), formatting.diff_array_repr(a, b, "equals") elif isinstance(a, Dataset): @@ -182,6 +197,8 @@ def assert_identical(a, b, from_root=True): Only used when comparing DataTree objects. Indicates whether or not to first traverse to the root of the trees before checking for isomorphism. If a & b have no parents then this has no effect. + check_dim_order : bool, optional, default is True + Whether dimensions must be in the same order. See Also -------- @@ -213,7 +230,9 @@ def assert_identical(a, b, from_root=True): @ensure_warnings -def assert_allclose(a, b, rtol=1e-05, atol=1e-08, decode_bytes=True): +def assert_allclose( + a, b, rtol=1e-05, atol=1e-08, decode_bytes=True, check_dim_order: bool = True +): """Like :py:func:`numpy.testing.assert_allclose`, but for xarray objects. Raises an AssertionError if two objects are not equal up to desired @@ -233,6 +252,8 @@ def assert_allclose(a, b, rtol=1e-05, atol=1e-08, decode_bytes=True): Whether byte dtypes should be decoded to strings as UTF-8 or not. This is useful for testing serialization methods on Python 3 that return saved strings as bytes. + check_dim_order : bool, optional, default is True + Whether dimensions must be in the same order. See Also -------- @@ -240,16 +261,16 @@ def assert_allclose(a, b, rtol=1e-05, atol=1e-08, decode_bytes=True): """ __tracebackhide__ = True assert type(a) == type(b) + b = maybe_transpose_dims(a, b, check_dim_order) equiv = functools.partial( _data_allclose_or_equiv, rtol=rtol, atol=atol, decode_bytes=decode_bytes ) - equiv.__name__ = "allclose" + equiv.__name__ = "allclose" # type: ignore[attr-defined] def compat_variable(a, b): a = getattr(a, "variable", a) b = getattr(b, "variable", b) - return a.dims == b.dims and (a._data is b._data or equiv(a.data, b.data)) if isinstance(a, Variable): diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 5d4ca78c049..64a879369f8 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -16,10 +16,8 @@ import xarray.testing from xarray import Dataset -from xarray.core import utils from xarray.core.duck_array_ops import allclose_or_equiv # noqa: F401 from xarray.core.extension_array import PandasExtensionArray -from xarray.core.indexing import ExplicitlyIndexed from xarray.core.options import set_options from xarray.core.variable import IndexVariable from xarray.testing import ( # noqa: F401 @@ -27,6 +25,13 @@ assert_duckarray_allclose, assert_duckarray_equal, ) +from xarray.tests.arrays import ( # noqa: F401 + ConcatenatableArray, + DuckArrayWrapper, + FirstElementAccessibleArray, + InaccessibleArray, + UnexpectedDataAccess, +) # import mpl and change the backend before other mpl imports try: @@ -130,6 +135,7 @@ def _importorskip( has_numexpr, requires_numexpr = _importorskip("numexpr") has_flox, requires_flox = _importorskip("flox") has_pandas_ge_2_2, __ = _importorskip("pandas", "2.2") +has_pandas_3, requires_pandas_3 = _importorskip("pandas", "3.0.0.dev0") # some special cases @@ -215,51 +221,6 @@ def raise_if_dask_computes(max_computes=0): network = pytest.mark.network -class UnexpectedDataAccess(Exception): - pass - - -class InaccessibleArray(utils.NDArrayMixin, ExplicitlyIndexed): - """Disallows any loading.""" - - def __init__(self, array): - self.array = array - - def get_duck_array(self): - raise UnexpectedDataAccess("Tried accessing data") - - def __array__(self, dtype: np.typing.DTypeLike = None): - raise UnexpectedDataAccess("Tried accessing data") - - def __getitem__(self, key): - raise UnexpectedDataAccess("Tried accessing data.") - - -class FirstElementAccessibleArray(InaccessibleArray): - def __getitem__(self, key): - tuple_idxr = key.tuple - if len(tuple_idxr) > 1: - raise UnexpectedDataAccess("Tried accessing more than one element.") - return self.array[tuple_idxr] - - -class DuckArrayWrapper(utils.NDArrayMixin): - """Array-like that prevents casting to array. - Modeled after cupy.""" - - def __init__(self, array: np.ndarray): - self.array = array - - def __getitem__(self, key): - return type(self)(self.array[key]) - - def __array__(self, dtype: np.typing.DTypeLike = None): - raise UnexpectedDataAccess("Tried accessing data") - - def __array_namespace__(self): - """Present to satisfy is_duck_array test.""" - - class ReturnItem: def __getitem__(self, key): return key diff --git a/xarray/tests/arrays.py b/xarray/tests/arrays.py new file mode 100644 index 00000000000..983e620d1f0 --- /dev/null +++ b/xarray/tests/arrays.py @@ -0,0 +1,179 @@ +from collections.abc import Iterable +from typing import Any, Callable + +import numpy as np + +from xarray.core import utils +from xarray.core.indexing import ExplicitlyIndexed + +""" +This module contains various lazy array classes which can be wrapped and manipulated by xarray objects but will raise on data access. +""" + + +class UnexpectedDataAccess(Exception): + pass + + +class InaccessibleArray(utils.NDArrayMixin, ExplicitlyIndexed): + """Disallows any loading.""" + + def __init__(self, array): + self.array = array + + def get_duck_array(self): + raise UnexpectedDataAccess("Tried accessing data") + + def __array__(self, dtype: np.typing.DTypeLike = None): + raise UnexpectedDataAccess("Tried accessing data") + + def __getitem__(self, key): + raise UnexpectedDataAccess("Tried accessing data.") + + +class FirstElementAccessibleArray(InaccessibleArray): + def __getitem__(self, key): + tuple_idxr = key.tuple + if len(tuple_idxr) > 1: + raise UnexpectedDataAccess("Tried accessing more than one element.") + return self.array[tuple_idxr] + + +class DuckArrayWrapper(utils.NDArrayMixin): + """Array-like that prevents casting to array. + Modeled after cupy.""" + + def __init__(self, array: np.ndarray): + self.array = array + + def __getitem__(self, key): + return type(self)(self.array[key]) + + def __array__(self, dtype: np.typing.DTypeLike = None): + raise UnexpectedDataAccess("Tried accessing data") + + def __array_namespace__(self): + """Present to satisfy is_duck_array test.""" + + +CONCATENATABLEARRAY_HANDLED_ARRAY_FUNCTIONS: dict[str, Callable] = {} + + +def implements(numpy_function): + """Register an __array_function__ implementation for ConcatenatableArray objects.""" + + def decorator(func): + CONCATENATABLEARRAY_HANDLED_ARRAY_FUNCTIONS[numpy_function] = func + return func + + return decorator + + +@implements(np.concatenate) +def concatenate( + arrays: Iterable["ConcatenatableArray"], /, *, axis=0 +) -> "ConcatenatableArray": + if any(not isinstance(arr, ConcatenatableArray) for arr in arrays): + raise TypeError + + result = np.concatenate([arr._array for arr in arrays], axis=axis) + return ConcatenatableArray(result) + + +@implements(np.stack) +def stack( + arrays: Iterable["ConcatenatableArray"], /, *, axis=0 +) -> "ConcatenatableArray": + if any(not isinstance(arr, ConcatenatableArray) for arr in arrays): + raise TypeError + + result = np.stack([arr._array for arr in arrays], axis=axis) + return ConcatenatableArray(result) + + +@implements(np.result_type) +def result_type(*arrays_and_dtypes) -> np.dtype: + """Called by xarray to ensure all arguments to concat have the same dtype.""" + first_dtype, *other_dtypes = (np.dtype(obj) for obj in arrays_and_dtypes) + for other_dtype in other_dtypes: + if other_dtype != first_dtype: + raise ValueError("dtypes not all consistent") + return first_dtype + + +@implements(np.broadcast_to) +def broadcast_to( + x: "ConcatenatableArray", /, shape: tuple[int, ...] +) -> "ConcatenatableArray": + """ + Broadcasts an array to a specified shape, by either manipulating chunk keys or copying chunk manifest entries. + """ + if not isinstance(x, ConcatenatableArray): + raise TypeError + + result = np.broadcast_to(x._array, shape=shape) + return ConcatenatableArray(result) + + +class ConcatenatableArray: + """Disallows loading or coercing to an index but does support concatenation / stacking.""" + + def __init__(self, array): + # use ._array instead of .array because we don't want this to be accessible even to xarray's internals (e.g. create_default_index_implicit) + self._array = array + + @property + def dtype(self: Any) -> np.dtype: + return self._array.dtype + + @property + def shape(self: Any) -> tuple[int, ...]: + return self._array.shape + + @property + def ndim(self: Any) -> int: + return self._array.ndim + + def __repr__(self: Any) -> str: + return f"{type(self).__name__}(array={self._array!r})" + + def get_duck_array(self): + raise UnexpectedDataAccess("Tried accessing data") + + def __array__(self, dtype: np.typing.DTypeLike = None): + raise UnexpectedDataAccess("Tried accessing data") + + def __getitem__(self, key) -> "ConcatenatableArray": + """Some cases of concat require supporting expanding dims by dimensions of size 1""" + # see https://data-apis.org/array-api/2022.12/API_specification/indexing.html#multi-axis-indexing + arr = self._array + for axis, indexer_1d in enumerate(key): + if indexer_1d is None: + arr = np.expand_dims(arr, axis) + elif indexer_1d is Ellipsis: + pass + else: + raise UnexpectedDataAccess("Tried accessing data.") + return ConcatenatableArray(arr) + + def __array_function__(self, func, types, args, kwargs) -> Any: + if func not in CONCATENATABLEARRAY_HANDLED_ARRAY_FUNCTIONS: + return NotImplemented + + # Note: this allows subclasses that don't override + # __array_function__ to handle ManifestArray objects + if not all(issubclass(t, ConcatenatableArray) for t in types): + return NotImplemented + + return CONCATENATABLEARRAY_HANDLED_ARRAY_FUNCTIONS[func](*args, **kwargs) + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs) -> Any: + """We have to define this in order to convince xarray that this class is a duckarray, even though we will never support ufuncs.""" + return NotImplemented + + def astype(self, dtype: np.dtype, /, *, copy: bool = True) -> "ConcatenatableArray": + """Needed because xarray will call this even when it's a no-op""" + if dtype != self.dtype: + raise NotImplementedError() + else: + return self diff --git a/xarray/tests/test_assertions.py b/xarray/tests/test_assertions.py index f7e49a0f3de..aa0ea46f7db 100644 --- a/xarray/tests/test_assertions.py +++ b/xarray/tests/test_assertions.py @@ -57,6 +57,25 @@ def test_allclose_regression() -> None: def test_assert_allclose(obj1, obj2) -> None: with pytest.raises(AssertionError): xr.testing.assert_allclose(obj1, obj2) + with pytest.raises(AssertionError): + xr.testing.assert_allclose(obj1, obj2, check_dim_order=False) + + +@pytest.mark.parametrize("func", ["assert_equal", "assert_allclose"]) +def test_assert_allclose_equal_transpose(func) -> None: + """Transposed DataArray raises assertion unless check_dim_order=False.""" + obj1 = xr.DataArray([[0, 1, 2], [2, 3, 4]], dims=["a", "b"]) + obj2 = xr.DataArray([[0, 2], [1, 3], [2, 4]], dims=["b", "a"]) + with pytest.raises(AssertionError): + getattr(xr.testing, func)(obj1, obj2) + getattr(xr.testing, func)(obj1, obj2, check_dim_order=False) + ds1 = obj1.to_dataset(name="varname") + ds1["var2"] = obj1 + ds2 = obj1.to_dataset(name="varname") + ds2["var2"] = obj1.transpose() + with pytest.raises(AssertionError): + getattr(xr.testing, func)(ds1, ds2) + getattr(xr.testing, func)(ds1, ds2, check_dim_order=False) @pytest.mark.filterwarnings("error") diff --git a/xarray/tests/test_cftime_offsets.py b/xarray/tests/test_cftime_offsets.py index 0110afe40ac..eabb7d2f4d6 100644 --- a/xarray/tests/test_cftime_offsets.py +++ b/xarray/tests/test_cftime_offsets.py @@ -42,6 +42,7 @@ has_cftime, has_pandas_ge_2_2, requires_cftime, + requires_pandas_3, ) cftime = pytest.importorskip("cftime") @@ -1354,7 +1355,7 @@ def test_calendar_specific_month_end_negative_freq( ) -> None: year = 2000 # Use a leap-year to highlight calendar differences result = cftime_range( - start="2000-12", + start="2001", end="2000", freq="-2ME", calendar=calendar, @@ -1464,7 +1465,7 @@ def test_date_range_errors() -> None: ("2020-02-01", "QE-DEC", "noleap", "gregorian", True, "2020-03-31", True), ("2020-02-01", "YS-FEB", "noleap", "gregorian", True, "2020-02-01", True), ("2020-02-01", "YE-FEB", "noleap", "gregorian", True, "2020-02-29", True), - ("2020-02-01", "-1YE-FEB", "noleap", "gregorian", True, "2020-02-29", True), + ("2020-02-01", "-1YE-FEB", "noleap", "gregorian", True, "2019-02-28", True), ("2020-02-28", "3h", "all_leap", "gregorian", False, "2020-02-28", True), ("2020-03-30", "ME", "360_day", "gregorian", False, "2020-03-31", True), ("2020-03-31", "ME", "gregorian", "360_day", None, "2020-03-30", False), @@ -1724,7 +1725,17 @@ def test_new_to_legacy_freq_pd_freq_passthrough(freq, expected): @pytest.mark.parametrize("start", ("2000", "2001")) @pytest.mark.parametrize("end", ("2000", "2001")) @pytest.mark.parametrize( - "freq", ("MS", "-1MS", "YS", "-1YS", "ME", "-1ME", "YE", "-1YE") + "freq", + ( + "MS", + pytest.param("-1MS", marks=requires_pandas_3), + "YS", + pytest.param("-1YS", marks=requires_pandas_3), + "ME", + pytest.param("-1ME", marks=requires_pandas_3), + "YE", + pytest.param("-1YE", marks=requires_pandas_3), + ), ) def test_cftime_range_same_as_pandas(start, end, freq): result = date_range(start, end, freq=freq, calendar="standard", use_cftime=True) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 1ddb5a569bd..0c570de3b52 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -12,7 +12,9 @@ from xarray.core.coordinates import Coordinates from xarray.core.indexes import PandasIndex from xarray.tests import ( + ConcatenatableArray, InaccessibleArray, + UnexpectedDataAccess, assert_array_equal, assert_equal, assert_identical, @@ -999,6 +1001,63 @@ def test_concat_str_dtype(self, dtype, dim) -> None: assert np.issubdtype(actual.x2.dtype, dtype) + def test_concat_avoids_index_auto_creation(self) -> None: + # TODO once passing indexes={} directly to Dataset constructor is allowed then no need to create coords first + coords = Coordinates( + {"x": ConcatenatableArray(np.array([1, 2, 3]))}, indexes={} + ) + datasets = [ + Dataset( + {"a": (["x", "y"], ConcatenatableArray(np.zeros((3, 3))))}, + coords=coords, + ) + for _ in range(2) + ] + # should not raise on concat + combined = concat(datasets, dim="x") + assert combined["a"].shape == (6, 3) + assert combined["a"].dims == ("x", "y") + + # nor have auto-created any indexes + assert combined.indexes == {} + + # should not raise on stack + combined = concat(datasets, dim="z") + assert combined["a"].shape == (2, 3, 3) + assert combined["a"].dims == ("z", "x", "y") + + # nor have auto-created any indexes + assert combined.indexes == {} + + def test_concat_avoids_index_auto_creation_new_1d_coord(self) -> None: + # create 0D coordinates (without indexes) + datasets = [ + Dataset( + coords={"x": ConcatenatableArray(np.array(10))}, + ) + for _ in range(2) + ] + + with pytest.raises(UnexpectedDataAccess): + concat(datasets, dim="x", create_index_for_new_dim=True) + + # should not raise on concat iff create_index_for_new_dim=False + combined = concat(datasets, dim="x", create_index_for_new_dim=False) + assert combined["x"].shape == (2,) + assert combined["x"].dims == ("x",) + + # nor have auto-created any indexes + assert combined.indexes == {} + + def test_concat_promote_shape_without_creating_new_index(self) -> None: + # different shapes but neither have indexes + ds1 = Dataset(coords={"x": 0}) + ds2 = Dataset(data_vars={"x": [1]}).drop_indexes("x") + actual = concat([ds1, ds2], dim="x", create_index_for_new_dim=False) + expected = Dataset(data_vars={"x": [0, 1]}).drop_indexes("x") + assert_identical(actual, expected, check_default_indexes=False) + assert actual.indexes == {} + class TestConcatDataArray: def test_concat(self) -> None: @@ -1072,6 +1131,35 @@ def test_concat_lazy(self) -> None: assert combined.shape == (2, 3, 3) assert combined.dims == ("z", "x", "y") + def test_concat_avoids_index_auto_creation(self) -> None: + # TODO once passing indexes={} directly to DataArray constructor is allowed then no need to create coords first + coords = Coordinates( + {"x": ConcatenatableArray(np.array([1, 2, 3]))}, indexes={} + ) + arrays = [ + DataArray( + ConcatenatableArray(np.zeros((3, 3))), + dims=["x", "y"], + coords=coords, + ) + for _ in range(2) + ] + # should not raise on concat + combined = concat(arrays, dim="x") + assert combined.shape == (6, 3) + assert combined.dims == ("x", "y") + + # nor have auto-created any indexes + assert combined.indexes == {} + + # should not raise on stack + combined = concat(arrays, dim="z") + assert combined.shape == (2, 3, 3) + assert combined.dims == ("z", "x", "y") + + # nor have auto-created any indexes + assert combined.indexes == {} + @pytest.mark.parametrize("fill_value", [dtypes.NA, 2, 2.0]) def test_concat_fill_value(self, fill_value) -> None: foo = DataArray([1, 2], coords=[("x", [1, 2])]) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 75ae4d67574..ecca8c0c79e 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -284,7 +284,7 @@ def test_repr(self) -> None: Dimensions: (dim2: 9, dim3: 10, time: 20, dim1: 8) Coordinates: * dim2 (dim2) float64 72B 0.0 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 - * dim3 (dim3) %s 40B 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' + * dim3 (dim3) {} 40B 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' * time (time) datetime64[ns] 160B 2000-01-01 2000-01-02 ... 2000-01-20 numbers (dim3) int64 80B 0 1 2 0 0 1 1 2 2 3 Dimensions without coordinates: dim1 @@ -293,8 +293,9 @@ def test_repr(self) -> None: var2 (dim1, dim2) float64 576B 1.162 -1.097 -2.123 ... 1.267 0.3328 var3 (dim3, dim1) float64 640B 0.5565 -0.2121 0.4563 ... -0.2452 -0.3616 Attributes: - foo: bar""" - % data["dim3"].dtype + foo: bar""".format( + data["dim3"].dtype + ) ) actual = "\n".join(x.rstrip() for x in repr(data).split("\n")) print(actual) @@ -3430,16 +3431,22 @@ def test_expand_dims_kwargs_python36plus(self) -> None: ) assert_identical(other_way_expected, other_way) - @pytest.mark.parametrize("create_index_flag", [True, False]) - def test_expand_dims_create_index_data_variable(self, create_index_flag): + @pytest.mark.parametrize("create_index_for_new_dim_flag", [True, False]) + def test_expand_dims_create_index_data_variable( + self, create_index_for_new_dim_flag + ): # data variables should not gain an index ever ds = Dataset({"x": 0}) - if create_index_flag: + if create_index_for_new_dim_flag: with pytest.warns(UserWarning, match="No index created"): - expanded = ds.expand_dims("x", create_index=create_index_flag) + expanded = ds.expand_dims( + "x", create_index_for_new_dim=create_index_for_new_dim_flag + ) else: - expanded = ds.expand_dims("x", create_index=create_index_flag) + expanded = ds.expand_dims( + "x", create_index_for_new_dim=create_index_for_new_dim_flag + ) # TODO Can't just create the expected dataset directly using constructor because of GH issue 8959 expected = Dataset({"x": ("x", [0])}).drop_indexes("x").reset_coords("x") @@ -3448,13 +3455,13 @@ def test_expand_dims_create_index_data_variable(self, create_index_flag): assert expanded.indexes == {} def test_expand_dims_create_index_coordinate_variable(self): - # coordinate variables should gain an index only if create_index is True (the default) + # coordinate variables should gain an index only if create_index_for_new_dim is True (the default) ds = Dataset(coords={"x": 0}) expanded = ds.expand_dims("x") expected = Dataset({"x": ("x", [0])}) assert_identical(expanded, expected) - expanded_no_index = ds.expand_dims("x", create_index=False) + expanded_no_index = ds.expand_dims("x", create_index_for_new_dim=False) # TODO Can't just create the expected dataset directly using constructor because of GH issue 8959 expected = Dataset(coords={"x": ("x", [0])}).drop_indexes("x") @@ -3468,7 +3475,7 @@ def test_expand_dims_create_index_from_iterable(self): expected = Dataset({"x": ("x", [0, 1])}) assert_identical(expanded, expected) - expanded_no_index = ds.expand_dims(x=[0, 1], create_index=False) + expanded_no_index = ds.expand_dims(x=[0, 1], create_index_for_new_dim=False) # TODO Can't just create the expected dataset directly using constructor because of GH issue 8959 expected = Dataset(coords={"x": ("x", [0, 1])}).drop_indexes("x") @@ -3970,6 +3977,25 @@ def test_to_stacked_array_to_unstacked_dataset_different_dimension(self) -> None x = y.to_unstacked_dataset("features") assert_identical(D, x) + def test_to_stacked_array_preserves_dtype(self) -> None: + # regression test for bug found in https://github.com/pydata/xarray/pull/8872#issuecomment-2081218616 + ds = xr.Dataset( + data_vars={ + "a": (("x", "y"), [[0, 1, 2], [3, 4, 5]]), + "b": ("x", [6, 7]), + }, + coords={"y": ["u", "v", "w"]}, + ) + stacked = ds.to_stacked_array("z", sample_dims=["x"]) + + # coordinate created from variables names should be of string dtype + data = np.array(["a", "a", "a", "b"], dtype=" None: data = create_test_data(seed=0) expected = data.copy() diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index e9e4eb1364c..7134fe96d01 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -577,8 +577,8 @@ def test_da_groupby_assign_coords() -> None: def test_groupby_repr(obj, dim) -> None: actual = repr(obj.groupby(dim)) expected = f"{obj.__class__.__name__}GroupBy" - expected += ", grouped over %r" % dim - expected += "\n%r groups with labels " % (len(np.unique(obj[dim]))) + expected += f", grouped over {dim!r}" + expected += f"\n{len(np.unique(obj[dim]))!r} groups with labels " if dim == "x": expected += "1, 2, 3, 4, 5." elif dim == "y": @@ -595,7 +595,7 @@ def test_groupby_repr_datetime(obj) -> None: actual = repr(obj.groupby("t.month")) expected = f"{obj.__class__.__name__}GroupBy" expected += ", grouped over 'month'" - expected += "\n%r groups with labels " % (len(np.unique(obj.t.dt.month))) + expected += f"\n{len(np.unique(obj.t.dt.month))!r} groups with labels " expected += "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12." assert actual == expected diff --git a/xarray/tests/test_rolling.py b/xarray/tests/test_rolling.py index 79a5ba0a667..89f6ebba2c3 100644 --- a/xarray/tests/test_rolling.py +++ b/xarray/tests/test_rolling.py @@ -254,7 +254,7 @@ def test_rolling_reduce( rolling_obj = da.rolling(time=window, center=center, min_periods=min_periods) # add nan prefix to numpy methods to get similar # behavior as bottleneck - actual = rolling_obj.reduce(getattr(np, "nan%s" % name)) + actual = rolling_obj.reduce(getattr(np, f"nan{name}")) expected = getattr(rolling_obj, name)() assert_allclose(actual, expected) assert actual.sizes == expected.sizes @@ -276,7 +276,7 @@ def test_rolling_reduce_nonnumeric( rolling_obj = da.rolling(time=window, center=center, min_periods=min_periods) # add nan prefix to numpy methods to get similar behavior as bottleneck - actual = rolling_obj.reduce(getattr(np, "nan%s" % name)) + actual = rolling_obj.reduce(getattr(np, f"nan{name}")) expected = getattr(rolling_obj, name)() assert_allclose(actual, expected) assert actual.sizes == expected.sizes @@ -741,7 +741,7 @@ def test_rolling_reduce(self, ds, center, min_periods, window, name) -> None: rolling_obj = ds.rolling(time=window, center=center, min_periods=min_periods) # add nan prefix to numpy methods to get similar behavior as bottleneck - actual = rolling_obj.reduce(getattr(np, "nan%s" % name)) + actual = rolling_obj.reduce(getattr(np, f"nan{name}")) expected = getattr(rolling_obj, name)() assert_allclose(actual, expected) assert ds.sizes == actual.sizes