From 4011cf75060fafa86bde05e97df589d7089e8bf3 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Mon, 31 Dec 2018 15:21:13 -0800 Subject: [PATCH 1/3] ENH: switch Dataset and DataArray to use explicit indexes This change switches Dataset.indexes and DataArray.indexes to be backed by explicit dictionaries of indexes, instead of being implicitly defined by the set of coordinates with names matching dimensions. There are no changes to the public interface yet: these will come later. For now, indexes are recreated from coordinates every time a new DataArray or Dataset is created. In follow-up PRs, I will refactor indexes to be propagated explicitly in xarray operations. This will facilitate future API changes, when indexes will no longer only be associated with dimensions. --- xarray/core/coordinates.py | 40 ++------------------------------------ xarray/core/dataarray.py | 15 ++++++++++---- xarray/core/dataset.py | 25 +++++++++++++++++------- 3 files changed, 31 insertions(+), 49 deletions(-) diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index efe8affb2a3..1840b20e13c 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -6,6 +6,7 @@ import pandas as pd from . import formatting, indexing +from .indexes import default_indexes from .merge import ( expand_and_merge_variables, merge_coords, merge_coords_for_inplace_math) from .pycompat import OrderedDict @@ -196,6 +197,7 @@ def _update_coords(self, coords): self._data._variables = variables self._data._coord_names.update(new_coord_names) self._data._dims = dict(dims) + self._data._indexes = default_indexes(variables, dims) def __delitem__(self, key): if key in self: @@ -276,44 +278,6 @@ def __iter__(self): return iter(self._data._level_coords) -class Indexes(Mapping, formatting.ReprMixin): - """Ordered Mapping[str, pandas.Index] for xarray objects. - """ - - def __init__(self, variables, sizes): - """Not for public consumption. 
- - Parameters - ---------- - variables : OrderedDict[Any, Variable] - Reference to OrderedDict holding variable objects. Should be the - same dictionary used by the source object. - sizes : OrderedDict[Any, int] - Map from dimension names to sizes. - """ - self._variables = variables - self._sizes = sizes - - def __iter__(self): - for key in self._sizes: - if key in self._variables: - yield key - - def __len__(self): - return sum(key in self._variables for key in self._sizes) - - def __contains__(self, key): - return key in self._sizes and key in self._variables - - def __getitem__(self, key): - if key not in self._sizes: - raise KeyError(key) - return self._variables[key].to_index() - - def __unicode__(self): - return formatting.indexes_repr(self) - - def assert_coordinate_consistent(obj, coords): """ Maeke sure the dimension coordinate of obj is consistent with coords. diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 25a66e529ae..0aa8ee2ec69 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -13,10 +13,11 @@ from .alignment import align, reindex_like_indexers from .common import AbstractArray, DataWithCoords from .coordinates import ( - DataArrayCoordinates, Indexes, LevelCoordinatesSource, + DataArrayCoordinates, LevelCoordinatesSource, assert_coordinate_consistent, remap_label_indexers) from .dataset import Dataset, merge_indexes, split_indexes from .formatting import format_item +from .indexes import default_indexes, Indexes from .options import OPTIONS from .pycompat import OrderedDict, basestring, iteritems, range, zip from .utils import ( @@ -165,7 +166,7 @@ class DataArray(AbstractArray, DataWithCoords): dt = property(DatetimeAccessor) def __init__(self, data, coords=None, dims=None, name=None, - attrs=None, encoding=None, fastpath=False): + attrs=None, encoding=None, indexes=None, fastpath=False): """ Parameters ---------- @@ -237,6 +238,12 @@ def __init__(self, data, coords=None, dims=None, name=None, 
self._coords = coords self._name = name + # TODO(shoyer): document this argument, once it becomes part of the + # public interface. + if indexes is None: + indexes = default_indexes(coords, variable.dims) + self._indexes = indexes + self._file_obj = None self._initialized = True @@ -534,9 +541,9 @@ def encoding(self, value): @property def indexes(self): - """OrderedDict of pandas.Index objects used for label based indexing + """Mapping of pandas.Index objects used for label based indexing """ - return Indexes(self._coords, self.sizes) + return Indexes(self._indexes) @property def coords(self): diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 62c6e98c954..e4b4e8f59b2 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -13,16 +13,17 @@ import xarray as xr from . import ( - alignment, dtypes, duck_array_ops, formatting, groupby, indexing, ops, - pdcompat, resample, rolling, utils) + alignment, dtypes, duck_array_ops, formatting, groupby, + indexing, ops, pdcompat, resample, rolling, utils) from ..coding.cftimeindex import _parse_array_of_cftime_strings from .alignment import align from .common import ( ALL_DIMS, DataWithCoords, ImplementsDatasetReduce, _contains_datetime_like_objects) from .coordinates import ( - DatasetCoordinates, Indexes, LevelCoordinatesSource, + DatasetCoordinates, LevelCoordinatesSource, assert_coordinate_consistent, remap_label_indexers) +from .indexes import Indexes, default_indexes from .merge import ( dataset_merge_method, dataset_update_method, merge_data_and_coords, merge_variables) @@ -364,6 +365,10 @@ def __init__(self, data_vars=None, coords=None, attrs=None, coords = {} if data_vars is not None or coords is not None: self._set_init_vars_and_dims(data_vars, coords, compat) + + # TODO(shoyer): expose indexes as a public argument in __init__ + self._indexes = default_indexes(self._variables, self._dims) + if attrs is not None: self.attrs = attrs self._encoding = None @@ -642,7 +647,7 @@ def persist(self, 
**kwargs): @classmethod def _construct_direct(cls, variables, coord_names, dims=None, attrs=None, - file_obj=None, encoding=None): + indexes=None, file_obj=None, encoding=None): """Shortcut around __init__ for internal use when we want to skip costly validation """ @@ -650,6 +655,9 @@ def _construct_direct(cls, variables, coord_names, dims=None, attrs=None, obj._variables = variables obj._coord_names = coord_names obj._dims = dims + if indexes is None: + indexes = default_indexes(variables, dims) + obj._indexes = indexes obj._attrs = attrs obj._file_obj = file_obj obj._encoding = encoding @@ -664,7 +672,8 @@ def _from_vars_and_coord_names(cls, variables, coord_names, attrs=None): return cls._construct_direct(variables, coord_names, dims, attrs) def _replace_vars_and_dims(self, variables, coord_names=None, dims=None, - attrs=__default_attrs, inplace=False): + attrs=__default_attrs, indexes=None, + inplace=False): """Fastpath constructor for internal use. Preserves coord names and attributes. 
If not provided explicitly, @@ -693,6 +702,8 @@ def _replace_vars_and_dims(self, variables, coord_names=None, dims=None, self._coord_names = coord_names if attrs is not self.__default_attrs: self._attrs = attrs + if indexes is None: + self._indexes = default_indexes(variables, dims) obj = self else: if coord_names is None: @@ -1064,9 +1075,9 @@ def identical(self, other): @property def indexes(self): - """OrderedDict of pandas.Index objects used for label based indexing + """Mapping of pandas.Index objects used for label based indexing """ - return Indexes(self._variables, self._dims) + return Indexes(self._indexes) @property def coords(self): From 2cd49604dbfb824418b22c6d4fb7f5d1b5570a63 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Mon, 31 Dec 2018 16:55:00 -0800 Subject: [PATCH 2/3] Add xarray.core.indexes --- xarray/core/indexes.py | 53 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 xarray/core/indexes.py diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py new file mode 100644 index 00000000000..6f105079796 --- /dev/null +++ b/xarray/core/indexes.py @@ -0,0 +1,53 @@ +from __future__ import absolute_import, division, print_function +try: + from collections.abc import Mapping +except ImportError: + from collections import Mapping + +from . import formatting + + +class Indexes(Mapping, formatting.ReprMixin): + """Immutable proxy for Dataset or DataArray indexes.""" + def __init__(self, indexes): + """Not for public consumption. + + Parameters + ---------- + indexes : Dict[Any, pandas.Index] + Indexes held by this object. 
+ """ + self._indexes = indexes + + def __iter__(self): + return iter(self._indexes) + + def __len__(self): + return len(self._indexes) + + def __contains__(self, key): + return key in self._indexes + + def __getitem__(self, key): + return self._indexes[key] + + def __unicode__(self): + return formatting.indexes_repr(self) + + +def default_indexes(coords, dims): + """Default indexes for a Dataset/DataArray. + + Parameters + ---------- + coords : Mapping[Any, xarray.Variable] + Coordinate variables from which to draw default indexes. + dims : iterable + Iterable of dimension names. + + Returns + ------- + Mapping[Any, pandas.Index] mapping indexing keys (levels/dimension names) + to indexes used for indexing along that dimension. + """ + return {key: coords[key].to_index() for key in dims if key in coords} From 9321e5acc1ff789e502edc77c5162c93708e6995 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Thu, 3 Jan 2019 20:18:31 -0800 Subject: [PATCH 3/3] Fixes per review --- xarray/core/coordinates.py | 3 +-- xarray/core/dataarray.py | 4 ++-- xarray/core/dataset.py | 12 ++++++------ xarray/core/indexes.py | 4 +++- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index 1840b20e13c..820937dae6a 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -6,7 +6,6 @@ import pandas as pd from . 
import formatting, indexing -from .indexes import default_indexes from .merge import ( expand_and_merge_variables, merge_coords, merge_coords_for_inplace_math) from .pycompat import OrderedDict @@ -197,7 +196,7 @@ def _update_coords(self, coords): self._data._variables = variables self._data._coord_names.update(new_coord_names) self._data._dims = dict(dims) - self._data._indexes = default_indexes(variables, dims) + self._data._indexes = None def __delitem__(self, key): if key in self: diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 0aa8ee2ec69..38aa1b42b92 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -240,8 +240,6 @@ def __init__(self, data, coords=None, dims=None, name=None, # TODO(shoyer): document this argument, once it becomes part of the # public interface. - if indexes is None: - indexes = default_indexes(coords, variable.dims) self._indexes = indexes self._file_obj = None @@ -543,6 +541,8 @@ def encoding(self, value): def indexes(self): """Mapping of pandas.Index objects used for label based indexing """ + if self._indexes is None: + self._indexes = default_indexes(self._coords, self.dims) return Indexes(self._indexes) @property diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index e4b4e8f59b2..0908e6ecce6 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -367,7 +367,7 @@ def __init__(self, data_vars=None, coords=None, attrs=None, self._set_init_vars_and_dims(data_vars, coords, compat) # TODO(shoyer): expose indexes as a public argument in __init__ - self._indexes = default_indexes(self._variables, self._dims) + self._indexes = None if attrs is not None: self.attrs = attrs @@ -655,8 +655,6 @@ def _construct_direct(cls, variables, coord_names, dims=None, attrs=None, obj._variables = variables obj._coord_names = coord_names obj._dims = dims - if indexes is None: - indexes = default_indexes(variables, dims) obj._indexes = indexes obj._attrs = attrs obj._file_obj = file_obj @@ 
-702,15 +700,15 @@ def _replace_vars_and_dims(self, variables, coord_names=None, dims=None, self._coord_names = coord_names if attrs is not self.__default_attrs: self._attrs = attrs - if indexes is None: - self._indexes = default_indexes(variables, dims) + self._indexes = indexes obj = self else: if coord_names is None: coord_names = self._coord_names.copy() if attrs is self.__default_attrs: attrs = self._attrs_copy() - obj = self._construct_direct(variables, coord_names, dims, attrs) + obj = self._construct_direct( + variables, coord_names, dims, attrs, indexes) return obj def _replace_indexes(self, indexes): @@ -1077,6 +1075,8 @@ def identical(self, other): def indexes(self): """Mapping of pandas.Index objects used for label based indexing """ + if self._indexes is None: + self._indexes = default_indexes(self._variables, self._dims) return Indexes(self._indexes) @property diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py index 6f105079796..ffa483fc370 100644 --- a/xarray/core/indexes.py +++ b/xarray/core/indexes.py @@ -3,6 +3,7 @@ from collections.abc import Mapping except ImportError: from collections import Mapping +from collections import OrderedDict from . import formatting @@ -50,4 +51,5 @@ def default_indexes(coords, dims): Mapping[Any, pandas.Index] mapping indexing keys (levels/dimension names) to indexes used for indexing along that dimension. """ - return {key: coords[key].to_index() for key in dims if key in coords} + return OrderedDict((key, coords[key].to_index()) + for key in dims if key in coords)