Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

mfdataset, concat now support the 'join' kwarg. #3102

Merged
merged 13 commits into from
Aug 7, 2019
2 changes: 2 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ New functions/methods
Enhancements
~~~~~~~~~~~~

- :py:func:`~xarray.concat` and :py:func:`~xarray.open_mfdataset` now support the ``join`` kwarg.
It is passed down to :py:func:`~xarray.align`. By `Deepak Cherian <https://github.com/dcherian>`_.
- In :py:meth:`~xarray.Dataset.to_zarr`, passing ``mode`` is not mandatory if
``append_dim`` is set, as it will automatically be set to ``'a'`` internally.
By `David Brochart <https://github.com/davidbrochart>`_.
Expand Down
20 changes: 16 additions & 4 deletions xarray/backends/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -609,7 +609,7 @@ def open_mfdataset(paths, chunks=None, concat_dim='_not_supplied',
compat='no_conflicts', preprocess=None, engine=None,
lock=None, data_vars='all', coords='different',
combine='_old_auto', autoclose=None, parallel=False,
**kwargs):
join='outer', **kwargs):
"""Open multiple files as a single dataset.

If combine='by_coords' then the function ``combine_by_coords`` is used to
Expand Down Expand Up @@ -704,6 +704,16 @@ def open_mfdataset(paths, chunks=None, concat_dim='_not_supplied',
parallel : bool, optional
If True, the open and preprocess steps of this function will be
performed in parallel using ``dask.delayed``. Default is False.
join : {'outer', 'inner', 'left', 'right', 'exact'}, optional
String indicating how to combine differing indexes
(excluding concat_dim) in objects

- 'outer': use the union of object indexes
- 'inner': use the intersection of object indexes
- 'left': use indexes from the first object with each dimension
- 'right': use indexes from the last object with each dimension
- 'exact': instead of aligning, raise `ValueError` when indexes to be
aligned are not equal
**kwargs : optional
Additional arguments passed on to :py:func:`xarray.open_dataset`.

Expand Down Expand Up @@ -798,18 +808,20 @@ def open_mfdataset(paths, chunks=None, concat_dim='_not_supplied',

combined = auto_combine(datasets, concat_dim=concat_dim,
compat=compat, data_vars=data_vars,
coords=coords, from_openmfds=True)
coords=coords, join=join,
from_openmfds=True)
elif combine == 'nested':
# Combined nested list by successive concat and merge operations
# along each dimension, using structure given by "ids"
combined = _nested_combine(datasets, concat_dims=concat_dim,
compat=compat, data_vars=data_vars,
coords=coords, ids=ids)
coords=coords, ids=ids, join=join)
elif combine == 'by_coords':
# Redo ordering from coordinates, ignoring how they were ordered
# previously
combined = combine_by_coords(datasets, compat=compat,
data_vars=data_vars, coords=coords)
data_vars=data_vars, coords=coords,
join=join)
else:
raise ValueError("{} is an invalid option for the keyword argument"
" ``combine``".format(combine))
Expand Down
76 changes: 56 additions & 20 deletions xarray/core/combine.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def _check_shape_tile_ids(combined_tile_ids):

def _combine_nd(combined_ids, concat_dims, data_vars='all',
coords='different', compat='no_conflicts',
fill_value=dtypes.NA):
fill_value=dtypes.NA, join='outer'):
"""
Combines an N-dimensional structure of datasets into one by applying a
series of either concat and merge operations along each dimension.
Expand Down Expand Up @@ -177,13 +177,14 @@ def _combine_nd(combined_ids, concat_dims, data_vars='all',
data_vars=data_vars,
coords=coords,
compat=compat,
fill_value=fill_value)
fill_value=fill_value,
join=join)
(combined_ds,) = combined_ids.values()
return combined_ds


def _combine_all_along_first_dim(combined_ids, dim, data_vars, coords, compat,
fill_value=dtypes.NA):
fill_value=dtypes.NA, join='outer'):

# Group into lines of datasets which must be combined along dim
# need to sort by _new_tile_id first for groupby to work
Expand All @@ -197,12 +198,13 @@ def _combine_all_along_first_dim(combined_ids, dim, data_vars, coords, compat,
combined_ids = OrderedDict(sorted(group))
datasets = combined_ids.values()
new_combined_ids[new_id] = _combine_1d(datasets, dim, compat,
data_vars, coords, fill_value)
data_vars, coords, fill_value,
join)
return new_combined_ids


def _combine_1d(datasets, concat_dim, compat='no_conflicts', data_vars='all',
coords='different', fill_value=dtypes.NA):
coords='different', fill_value=dtypes.NA, join='outer'):
"""
Applies either concat or merge to 1D list of datasets depending on value
of concat_dim
Expand All @@ -211,7 +213,7 @@ def _combine_1d(datasets, concat_dim, compat='no_conflicts', data_vars='all',
if concat_dim is not None:
try:
combined = concat(datasets, dim=concat_dim, data_vars=data_vars,
coords=coords, fill_value=fill_value)
coords=coords, fill_value=fill_value, join=join)
except ValueError as err:
if "encountered unexpected variable" in str(err):
raise ValueError("These objects cannot be combined using only "
Expand All @@ -222,7 +224,8 @@ def _combine_1d(datasets, concat_dim, compat='no_conflicts', data_vars='all',
else:
raise
else:
combined = merge(datasets, compat=compat, fill_value=fill_value)
combined = merge(datasets, compat=compat, fill_value=fill_value,
join=join)

return combined

Expand All @@ -233,7 +236,7 @@ def _new_tile_id(single_id_ds_pair):


def _nested_combine(datasets, concat_dims, compat, data_vars, coords, ids,
fill_value=dtypes.NA):
fill_value=dtypes.NA, join='outer'):

if len(datasets) == 0:
return Dataset()
Expand All @@ -254,12 +257,13 @@ def _nested_combine(datasets, concat_dims, compat, data_vars, coords, ids,
# Apply series of concatenate or merge operations along each dimension
combined = _combine_nd(combined_ids, concat_dims, compat=compat,
data_vars=data_vars, coords=coords,
fill_value=fill_value)
fill_value=fill_value, join=join)
return combined


def combine_nested(datasets, concat_dim, compat='no_conflicts',
data_vars='all', coords='different', fill_value=dtypes.NA):
data_vars='all', coords='different', fill_value=dtypes.NA,
join='outer'):
"""
Explicitly combine an N-dimensional grid of datasets into one by using a
succession of concat and merge operations along each dimension of the grid.
Expand Down Expand Up @@ -312,6 +316,16 @@ def combine_nested(datasets, concat_dim, compat='no_conflicts',
Details are in the documentation of concat
fill_value : scalar, optional
Value to use for newly missing values
join : {'outer', 'inner', 'left', 'right', 'exact'}, optional
String indicating how to combine differing indexes
(excluding concat_dim) in objects

- 'outer': use the union of object indexes
- 'inner': use the intersection of object indexes
- 'left': use indexes from the first object with each dimension
- 'right': use indexes from the last object with each dimension
- 'exact': instead of aligning, raise `ValueError` when indexes to be
aligned are not equal

Returns
-------
Expand Down Expand Up @@ -383,15 +397,15 @@ def combine_nested(datasets, concat_dim, compat='no_conflicts',
# The ids argument tells _nested_combine that datasets aren't yet sorted
return _nested_combine(datasets, concat_dims=concat_dim, compat=compat,
data_vars=data_vars, coords=coords, ids=False,
fill_value=fill_value)
fill_value=fill_value, join=join)


def vars_as_keys(ds):
    """Return the items yielded by iterating ``ds`` as a sorted tuple.

    Parameters
    ----------
    ds : iterable
        Object to iterate over; presumably a Dataset whose iteration yields
        its data variable names (confirm against callers).

    Returns
    -------
    tuple
        The iterated items in ascending sorted order.
    """
    return tuple(sorted(ds))


def combine_by_coords(datasets, compat='no_conflicts', data_vars='all',
coords='different', fill_value=dtypes.NA):
coords='different', fill_value=dtypes.NA, join='outer'):
"""
Attempt to auto-magically combine the given datasets into one by using
dimension coordinates.
Expand Down Expand Up @@ -439,6 +453,16 @@ def combine_by_coords(datasets, compat='no_conflicts', data_vars='all',
Details are in the documentation of concat
fill_value : scalar, optional
Value to use for newly missing values
join : {'outer', 'inner', 'left', 'right', 'exact'}, optional
String indicating how to combine differing indexes
(excluding concat_dim) in objects

- 'outer': use the union of object indexes
- 'inner': use the intersection of object indexes
- 'left': use indexes from the first object with each dimension
- 'right': use indexes from the last object with each dimension
- 'exact': instead of aligning, raise `ValueError` when indexes to be
aligned are not equal

Returns
-------
Expand Down Expand Up @@ -498,7 +522,7 @@ def combine_by_coords(datasets, compat='no_conflicts', data_vars='all',
# Concatenate along all of concat_dims one by one to create single ds
concatenated = _combine_nd(combined_ids, concat_dims=concat_dims,
data_vars=data_vars, coords=coords,
fill_value=fill_value)
fill_value=fill_value, join=join)

# Check the overall coordinates are monotonically increasing
for dim in concat_dims:
Expand All @@ -511,7 +535,7 @@ def combine_by_coords(datasets, compat='no_conflicts', data_vars='all',
concatenated_grouped_by_data_vars.append(concatenated)

return merge(concatenated_grouped_by_data_vars, compat=compat,
fill_value=fill_value)
fill_value=fill_value, join=join)


# Everything beyond here is only needed until the deprecation cycle in #2616
Expand All @@ -523,7 +547,7 @@ def combine_by_coords(datasets, compat='no_conflicts', data_vars='all',

def auto_combine(datasets, concat_dim='_not_supplied', compat='no_conflicts',
data_vars='all', coords='different', fill_value=dtypes.NA,
from_openmfds=False):
join='outer', from_openmfds=False):
"""
Attempt to auto-magically combine the given datasets into one.

Expand Down Expand Up @@ -571,6 +595,16 @@ def auto_combine(datasets, concat_dim='_not_supplied', compat='no_conflicts',
Details are in the documentation of concat
fill_value : scalar, optional
Value to use for newly missing values
join : {'outer', 'inner', 'left', 'right', 'exact'}, optional
String indicating how to combine differing indexes
(excluding concat_dim) in objects

- 'outer': use the union of object indexes
- 'inner': use the intersection of object indexes
- 'left': use indexes from the first object with each dimension
- 'right': use indexes from the last object with each dimension
- 'exact': instead of aligning, raise `ValueError` when indexes to be
aligned are not equal

Returns
-------
Expand Down Expand Up @@ -629,7 +663,8 @@ def auto_combine(datasets, concat_dim='_not_supplied', compat='no_conflicts',

return _old_auto_combine(datasets, concat_dim=concat_dim,
compat=compat, data_vars=data_vars,
coords=coords, fill_value=fill_value)
coords=coords, fill_value=fill_value,
join=join)


def _dimension_coords_exist(datasets):
Expand Down Expand Up @@ -670,7 +705,7 @@ def _requires_concat_and_merge(datasets):
def _old_auto_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT,
compat='no_conflicts',
data_vars='all', coords='different',
fill_value=dtypes.NA):
fill_value=dtypes.NA, join='outer'):
if concat_dim is not None:
dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim

Expand All @@ -679,16 +714,17 @@ def _old_auto_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT,

concatenated = [_auto_concat(list(datasets), dim=dim,
data_vars=data_vars, coords=coords,
fill_value=fill_value)
fill_value=fill_value, join=join)
for vars, datasets in grouped]
else:
concatenated = datasets
merged = merge(concatenated, compat=compat, fill_value=fill_value)
merged = merge(concatenated, compat=compat, fill_value=fill_value,
join=join)
return merged


def _auto_concat(datasets, dim=None, data_vars='all', coords='different',
fill_value=dtypes.NA):
fill_value=dtypes.NA, join='outer'):
if len(datasets) == 1 and dim is None:
# There is nothing more to combine, so kick out early.
return datasets[0]
Expand Down
27 changes: 19 additions & 8 deletions xarray/core/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

def concat(objs, dim=None, data_vars='all', coords='different',
compat='equals', positions=None, indexers=None, mode=None,
concat_over=None, fill_value=dtypes.NA):
concat_over=None, fill_value=dtypes.NA, join='outer'):
"""Concatenate xarray objects along a new or existing dimension.

Parameters
Expand Down Expand Up @@ -52,7 +52,7 @@ def concat(objs, dim=None, data_vars='all', coords='different',
* 'all': All coordinate variables will be concatenated, except
those corresponding to other dimensions.
* list of str: The listed coordinate variables will be concatenated,
in addition the 'minimal' coordinates.
in addition to the 'minimal' coordinates.
compat : {'equals', 'identical'}, optional
String indicating how to compare non-concatenated variables and
dataset global attributes for potential conflicts. 'equals' means
Expand All @@ -65,6 +65,17 @@ def concat(objs, dim=None, data_vars='all', coords='different',
supplied, objects are concatenated in the provided order.
fill_value : scalar, optional
Value to use for newly missing values
join : {'outer', 'inner', 'left', 'right', 'exact'}, optional
String indicating how to combine differing indexes
(excluding dim) in objects

- 'outer': use the union of object indexes
- 'inner': use the intersection of object indexes
- 'left': use indexes from the first object with each dimension
- 'right': use indexes from the last object with each dimension
- 'exact': instead of aligning, raise `ValueError` when indexes to be
aligned are not equal

indexers, mode, concat_over : deprecated

Returns
Expand All @@ -76,7 +87,7 @@ def concat(objs, dim=None, data_vars='all', coords='different',
merge
auto_combine
"""
# TODO: add join and ignore_index arguments copied from pandas.concat
# TODO: add ignore_index argument copied from pandas.concat
# TODO: support concatenating scalar coordinates even if the concatenated
# dimension already exists
from .dataset import Dataset
Expand Down Expand Up @@ -116,7 +127,7 @@ def concat(objs, dim=None, data_vars='all', coords='different',
else:
raise TypeError('can only concatenate xarray Dataset and DataArray '
'objects, got %s' % type(first_obj))
return f(objs, dim, data_vars, coords, compat, positions, fill_value)
return f(objs, dim, data_vars, coords, compat, positions, fill_value, join)


def _calc_concat_dim_coord(dim):
Expand Down Expand Up @@ -212,7 +223,7 @@ def process_subset_opt(opt, subset):


def _dataset_concat(datasets, dim, data_vars, coords, compat, positions,
fill_value=dtypes.NA):
fill_value=dtypes.NA, join='outer'):
"""
Concatenate a sequence of datasets along a new or existing dimension
"""
Expand All @@ -225,7 +236,7 @@ def _dataset_concat(datasets, dim, data_vars, coords, compat, positions,
dim, coord = _calc_concat_dim_coord(dim)
# Make sure we're working on a copy (we'll be loading variables)
datasets = [ds.copy() for ds in datasets]
datasets = align(*datasets, join='outer', copy=False, exclude=[dim],
datasets = align(*datasets, join=join, copy=False, exclude=[dim],
fill_value=fill_value)

concat_over, equals = _calc_concat_over(datasets, dim, data_vars, coords)
Expand Down Expand Up @@ -318,7 +329,7 @@ def ensure_common_dims(vars):


def _dataarray_concat(arrays, dim, data_vars, coords, compat,
positions, fill_value=dtypes.NA):
positions, fill_value=dtypes.NA, join='outer'):
arrays = list(arrays)

if data_vars != 'all':
Expand All @@ -337,5 +348,5 @@ def _dataarray_concat(arrays, dim, data_vars, coords, compat,
datasets.append(arr._to_temp_dataset())

ds = _dataset_concat(datasets, dim, data_vars, coords, compat,
positions, fill_value=fill_value)
positions, fill_value=fill_value, join=join)
return arrays[0]._from_temp_dataset(ds, name)
9 changes: 8 additions & 1 deletion xarray/core/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -530,7 +530,14 @@ def merge(objects, compat='no_conflicts', join='outer', fill_value=dtypes.NA):
must be equal. The returned dataset then contains the combination
of all non-null values.
join : {'outer', 'inner', 'left', 'right', 'exact'}, optional
How to combine objects with different indexes.
String indicating how to combine differing indexes in objects.

- 'outer': use the union of object indexes
- 'inner': use the intersection of object indexes
- 'left': use indexes from the first object with each dimension
- 'right': use indexes from the last object with each dimension
- 'exact': instead of aligning, raise `ValueError` when indexes to be
aligned are not equal
fill_value : scalar, optional
Value to use for newly missing values

Expand Down
Loading