implement dask methods on DataTree #9670
Changes from 12 commits
@@ -18,7 +18,7 @@
 from xarray.core._aggregations import DataTreeAggregations
 from xarray.core._typed_ops import DataTreeOpsMixin
 from xarray.core.alignment import align
-from xarray.core.common import TreeAttrAccessMixin
+from xarray.core.common import TreeAttrAccessMixin, get_chunksizes
 from xarray.core.coordinates import Coordinates, DataTreeCoordinates
 from xarray.core.dataarray import DataArray
 from xarray.core.dataset import Dataset, DataVariables

@@ -49,6 +49,8 @@
     parse_dims_as_set,
 )
 from xarray.core.variable import Variable
+from xarray.namedarray.parallelcompat import get_chunked_array_type
+from xarray.namedarray.pycompat import is_chunked_array

 try:
     from xarray.core.variable import calculate_dimensions

@@ -68,8 +70,11 @@
         ErrorOptions,
         ErrorOptionsWithWarn,
         NetcdfWriteModes,
+        T_ChunkDimFreq,
+        T_ChunksFreq,
         ZarrWriteModes,
     )
+    from xarray.namedarray.parallelcompat import ChunkManagerEntrypoint

 # """
 # DEVELOPERS' NOTE

@@ -862,9 +867,9 @@ def _copy_node(
     ) -> Self:
         """Copy just one node of a tree."""
         new_node = super()._copy_node(inherit=inherit, deep=deep, memo=memo)
-        data = self._to_dataset_view(rebuild_dims=False, inherit=inherit)
-        if deep:
-            data = data._copy(deep=True, memo=memo)
+        data = self._to_dataset_view(rebuild_dims=False, inherit=inherit)._copy(
+            deep=deep, memo=memo
+        )
         new_node._set_node_data(data)
         return new_node
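Why this matters for the new methods below: with the old code a shallow node copy reused the same Variable objects as the source, so loading data into the copy could write back into the original tree; always calling ``_copy(deep=deep)`` appears to be what the new ``compute`` relies on. A minimal sketch of the Variable semantics involved (illustrative values only):

    import numpy as np
    import xarray as xr

    v = xr.Variable("x", np.arange(3))
    shallow = v.copy(deep=False)  # new Variable wrapper around the same array
    shallow.data = np.zeros(3)    # rebinding .data only affects the copy
    print(v.values)               # still [0 1 2]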

@@ -1896,3 +1901,190 @@ def apply_indexers(dataset, node_indexers):

         indexers = either_dict_or_kwargs(indexers, indexers_kwargs, "sel")
         return self._selective_indexing(apply_indexers, indexers)
+
+    def load(self, **kwargs) -> Self:
+        """Manually trigger loading and/or computation of this datatree's data
+        from disk or a remote source into memory and return this datatree.
+        Unlike compute, the original datatree is modified and returned.
+
+        Normally, it should not be necessary to call this method in user code,
+        because all xarray functions should either work on deferred data or
+        load data automatically. However, this method can be necessary when
+        working with many file objects on disk.
+
+        Parameters
+        ----------
+        **kwargs : dict
+            Additional keyword arguments passed on to ``dask.compute``.
+
+        See Also
+        --------
+        Dataset.load
+        dask.compute
+        """
+        # access .data to coerce everything to numpy or dask arrays
+        lazy_data = {
+            path: {
+                k: v._data
+                for k, v in node.variables.items()
+                if is_chunked_array(v._data)
+            }
+            for path, node in self.subtree_with_keys
+        }
+        flat_lazy_data = {
+            (path, var_name): array
+            for path, node in lazy_data.items()
+            for var_name, array in node.items()
+        }
+        # lazy_data has an entry per node even when nothing is chunked, so test
+        # the flattened mapping instead
+        if flat_lazy_data:
+            chunkmanager = get_chunked_array_type(*flat_lazy_data.values())
+
+            # evaluate all the chunked arrays simultaneously
+            evaluated_data: tuple[np.ndarray[Any, Any], ...] = chunkmanager.compute(
+                *flat_lazy_data.values(), **kwargs
+            )
+
+            for (path, var_name), data in zip(
+                flat_lazy_data, evaluated_data, strict=False
+            ):
+                self[path].variables[var_name].data = data
+
+        # load everything else sequentially, keyed by (path, name) to match how
+        # the chunked variables were recorded above
+        for path, node in self.subtree_with_keys:
+            for var_name, variable in node.variables.items():
+                if (path, var_name) not in flat_lazy_data:
+                    variable.load()
+
+        return self
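A toy illustration (made-up values) of the nested-to-flat reshaping above, which is what lets a single ``chunkmanager.compute`` call cover every group at once:

    lazy = {"/": {"a": "dask-array-1"}, "/child": {"b": "dask-array-2"}}
    flat = {
        (path, name): arr
        for path, group in lazy.items()
        for name, arr in group.items()
    }
    print(flat)  # {('/', 'a'): 'dask-array-1', ('/child', 'b'): 'dask-array-2'}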
+
+    def compute(self, **kwargs) -> Self:
+        """Manually trigger loading and/or computation of this datatree's data
+        from disk or a remote source into memory and return a new datatree.
+        Unlike load, the original datatree is left unaltered.
+
+        Normally, it should not be necessary to call this method in user code,
+        because all xarray functions should either work on deferred data or
+        load data automatically. However, this method can be necessary when
+        working with many file objects on disk.
+
+        Parameters
+        ----------
+        **kwargs : dict
+            Additional keyword arguments passed on to ``dask.compute``.
+
+        Returns
+        -------
+        object : DataTree
+            New object with lazy data variables and coordinates as in-memory arrays.
+
+        See Also
+        --------
+        dask.compute
+        """
+        new = self.copy(deep=False)
+        return new.load(**kwargs)
+
+    @property
+    def chunksizes(self) -> Mapping[str, Mapping[Hashable, tuple[int, ...]]]:
+        """
+        Mapping from group paths to a mapping of dimension names to block lengths
+        for this datatree's data; groups without chunked data map to an empty
+        mapping.
+
+        Cannot be modified directly, but can be modified by calling .chunk().
+
+        See Also
+        --------
+        DataTree.chunk
+        Dataset.chunksizes
+        """
+        return {
+            node.path: get_chunksizes(node.variables.values()) for node in self.subtree
+        }
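Each value in that mapping is the same kind of mapping ``Dataset.chunksizes`` already returns for a single group; a quick way to see the per-group building block (assuming dask is installed):

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({"a": ("x", np.arange(4))}).chunk({"x": 2})
    print(ds.chunksizes)  # e.g. Frozen({'x': (2, 2)})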
+
+    def chunk(
+        self,
+        chunks: T_ChunksFreq = {},  # noqa: B006  # {} even though it's technically unsafe, is being used intentionally here (#4667)
+        name_prefix: str = "xarray-",
+        token: str | None = None,
+        lock: bool = False,
+        inline_array: bool = False,
+        chunked_array_type: str | ChunkManagerEntrypoint | None = None,
+        from_array_kwargs=None,
+        **chunks_kwargs: T_ChunkDimFreq,
+    ) -> Self:
+        """Coerce all arrays in all groups in this tree into dask arrays with the
+        given chunks.
+
+        Non-dask arrays in this tree will be converted to dask arrays. Dask
+        arrays will be rechunked to the given chunk sizes.
+
+        If chunks are not provided for one or more dimensions, chunk sizes along
+        those dimensions will not be updated; non-dask arrays will be converted
+        into dask arrays with a single block.
+
+        Along datetime-like dimensions, a :py:class:`groupers.TimeResampler` object is also accepted.
+
+        Parameters
+        ----------
+        chunks : int, tuple of int, "auto" or mapping of hashable to int or a TimeResampler, optional
+            Chunk sizes along each dimension, e.g., ``5``, ``"auto"``, or
+            ``{"x": 5, "y": 5}`` or ``{"x": 5, "time": TimeResampler(freq="YE")}``.
+        name_prefix : str, default: "xarray-"
+            Prefix for the name of any new dask arrays.
+        token : str, optional
+            Token uniquely identifying this datatree.
+        lock : bool, default: False
+            Passed on to :py:func:`dask.array.from_array`, if the array is not
+            already a dask array.
+        inline_array : bool, default: False
+            Passed on to :py:func:`dask.array.from_array`, if the array is not
+            already a dask array.
+        chunked_array_type : str, optional
+            Which chunked array type to coerce this datatree's arrays to.
+            Defaults to 'dask' if installed, else whatever is registered via the
+            `ChunkManagerEntryPoint` system. Experimental API that should not be
+            relied upon.
+        from_array_kwargs : dict, optional
+            Additional keyword arguments passed on to the
+            `ChunkManagerEntrypoint.from_array` method used to create chunked
+            arrays, via whichever chunk manager is specified through the
+            `chunked_array_type` kwarg. For example, with dask as the default
+            chunked array type, this method would pass additional kwargs to
+            :py:func:`dask.array.from_array`. Experimental API that should not be
+            relied upon.
+        **chunks_kwargs : {dim: chunks, ...}, optional
+            The keyword arguments form of ``chunks``.
+            One of chunks or chunks_kwargs must be provided.
+
+        Returns
+        -------
+        chunked : xarray.DataTree
+
+        See Also
+        --------
+        Dataset.chunk
+        Dataset.chunksizes
+        xarray.unify_chunks
+        dask.array.from_array
+        """
+        # don't support deprecated ways of passing chunks
+        if not isinstance(chunks, Mapping):
+            raise TypeError(
+                f"invalid type for chunks: {type(chunks)}. Only mappings are supported."
+            )
+        combined_chunks = either_dict_or_kwargs(chunks, chunks_kwargs, "chunk")
+
+        rechunked_groups = {
+            path: node.dataset.chunk(
+                {
+                    dim: size
+                    for dim, size in combined_chunks.items()
+                    if dim in node.dataset.dims
+                },
+                name_prefix=name_prefix,
+                token=token,
+                lock=lock,
+                inline_array=inline_array,
+                chunked_array_type=chunked_array_type,
+                from_array_kwargs=from_array_kwargs,
+            )
+            for path, node in self.subtree_with_keys
+        }
+
+        return DataTree.from_dict(rechunked_groups, name=self.name)
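Putting the new methods together, a rough usage sketch (the group names, variables, and chunk sizes below are made up for illustration, and dask is assumed to be installed):

    import numpy as np
    import xarray as xr

    # hypothetical two-group tree
    tree = xr.DataTree.from_dict(
        {
            "/": xr.Dataset({"a": ("x", np.arange(4))}),
            "/child": xr.Dataset({"b": ("y", np.arange(6))}),
        }
    )

    # one mapping covers both groups; each node keeps only the dims it has
    chunked = tree.chunk({"x": 2, "y": 3})
    print(chunked.chunksizes)  # roughly {'/': {'x': (2, 2)}, '/child': {'y': (3, 3)}}

    in_memory = chunked.compute()  # new numpy-backed tree; ``chunked`` stays lazy
    chunked.load()                 # loads ``chunked`` itself in place and returns it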