diff --git a/doc/whats-new.rst b/doc/whats-new.rst index ea3a12aae0d..2d67b98d67e 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -24,9 +24,19 @@ Breaking changes Enhancements ~~~~~~~~~~~~ +- Upsampling an array via interpolation with resample is now dask-compatible, + as long as the array is not chunked along the resampling dimension. + By `Spencer Clark `_. + Bug fixes ~~~~~~~~~ +- Interpolating via resample now internally specifies ``bounds_error=False`` + as an argument to ``scipy.interpolate.interp1d``, allowing for interpolation + from higher frequencies to lower frequencies. Datapoints outside the bounds + of the original time coordinate are now filled with NaN (:issue:`2197`). By + `Spencer Clark `_. + .. _whats-new.0.11.1: v0.11.1 (29 December 2018) diff --git a/xarray/core/resample.py b/xarray/core/resample.py index 49351efc70f..886303db345 100644 --- a/xarray/core/resample.py +++ b/xarray/core/resample.py @@ -2,7 +2,6 @@ from . import ops from .groupby import DEFAULT_DIMS, DataArrayGroupBy, DatasetGroupBy -from .pycompat import OrderedDict, dask_array_type RESAMPLE_DIM = '__resample_dim__' @@ -110,7 +109,16 @@ def interpolate(self, kind='linear'): return self._interpolate(kind=kind) def _interpolate(self, kind='linear'): - raise NotImplementedError + """Apply scipy.interpolate.interp1d along resampling dimension.""" + # drop any existing non-dimension coordinates along the resampling + # dimension + dummy = self._obj.copy() + for k, v in self._obj.coords.items(): + if k != self._dim and self._dim in v.dims: + dummy = dummy.drop(k) + return dummy.interp(assume_sorted=True, method=kind, + kwargs={'bounds_error': False}, + **{self._dim: self._full_index}) class DataArrayResample(DataArrayGroupBy, Resample): @@ -182,46 +190,6 @@ def apply(self, func, shortcut=False, args=(), **kwargs): return combined - def _interpolate(self, kind='linear'): - """Apply scipy.interpolate.interp1d along resampling dimension.""" - from .dataarray import DataArray - from scipy.interpolate import interp1d - - if isinstance(self._obj.data, dask_array_type): - raise TypeError( - "Up-sampling via interpolation was attempted on the the " - "variable '{}', but it is a dask array; dask arrays are not " - "yet supported in resample.interpolate(). Load into " - "memory with Dataset.load() before resampling." - .format(self._obj.data.name) - ) - - x = self._obj[self._dim].astype('float') - y = self._obj.data - - axis = self._obj.get_axis_num(self._dim) - - f = interp1d(x, y, kind=kind, axis=axis, bounds_error=True, - assume_sorted=True) - new_x = self._full_index.values.astype('float') - - # construct new up-sampled DataArray - dummy = self._obj.copy() - dims = dummy.dims - - # drop any existing non-dimension coordinates along the resampling - # dimension - coords = OrderedDict() - for k, v in dummy.coords.items(): - # is the resampling dimension - if k == self._dim: - coords[self._dim] = self._full_index - # else, check if resampling dim is in coordinate dimensions - elif self._dim not in v.dims: - coords[k] = v - return DataArray(f(new_x), coords, dims, name=dummy.name, - attrs=dummy.attrs) - ops.inject_reduce_methods(DataArrayResample) ops.inject_binary_ops(DataArrayResample) @@ -308,50 +276,6 @@ def reduce(self, func, dim=None, keep_attrs=None, **kwargs): return super(DatasetResample, self).reduce( func, dim, keep_attrs, **kwargs) - def _interpolate(self, kind='linear'): - """Apply scipy.interpolate.interp1d along resampling dimension.""" - from .dataset import Dataset - from .variable import Variable - from scipy.interpolate import interp1d - - old_times = self._obj[self._dim].astype(float) - new_times = self._full_index.values.astype(float) - - data_vars = OrderedDict() - coords = OrderedDict() - - # Apply the interpolation to each DataArray in our original Dataset - for name, variable in self._obj.variables.items(): - if name in self._obj.coords: - if name == self._dim: - coords[self._dim] = self._full_index - elif self._dim not in variable.dims: - coords[name] = variable - else: - if isinstance(variable.data, dask_array_type): - raise TypeError( - "Up-sampling via interpolation was attempted on the " - "variable '{}', but it is a dask array; dask arrays " - "are not yet supprted in resample.interpolate(). Load " - "into memory with Dataset.load() before resampling." - .format(name) - ) - - axis = variable.get_axis_num(self._dim) - - # We've previously checked for monotonicity along the - # re-sampling dimension (in __init__ via the GroupBy - # constructor), so we can avoid sorting the data again by - # passing 'assume_sorted=True' - f = interp1d(old_times, variable.data, kind=kind, - axis=axis, bounds_error=True, - assume_sorted=True) - interpolated = Variable(variable.dims, f(new_times)) - - data_vars[name] = interpolated - - return Dataset(data_vars, coords) - ops.inject_reduce_methods(DatasetResample) ops.inject_binary_ops(DatasetResample) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 9b4f8523178..aa02e802fc5 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -2524,6 +2524,16 @@ def test_upsample_interpolate(self): # done here due to floating point arithmetic assert_allclose(expected, actual, rtol=1e-16) + @requires_scipy + def test_upsample_interpolate_bug_2197(self): + dates = pd.date_range('2007-02-01', '2007-03-01', freq='D') + da = xr.DataArray(np.arange(len(dates)), [('time', dates)]) + result = da.resample(time='M').interpolate('linear') + expected_times = np.array([np.datetime64('2007-02-28'), + np.datetime64('2007-03-31')]) + expected = xr.DataArray([27., np.nan], [('time', expected_times)]) + assert_equal(result, expected) + @requires_scipy def test_upsample_interpolate_regression_1605(self): dates = pd.date_range('2016-01-01', '2016-03-31', freq='1D') @@ -2536,21 +2546,42 @@ def test_upsample_interpolate_regression_1605(self): @requires_dask @requires_scipy def test_upsample_interpolate_dask(self): - import dask.array as da - - times = pd.date_range('2000-01-01', freq='6H', periods=5) + from scipy.interpolate import interp1d xs = np.arange(6) ys = np.arange(3) + times = pd.date_range('2000-01-01', freq='6H', periods=5) z = np.arange(5)**2 - data = da.from_array(np.tile(z, (6, 3, 1)), (1, 3, 1)) + data = np.tile(z, (6, 3, 1)) array = DataArray(data, {'time': times, 'x': xs, 'y': ys}, ('x', 'y', 'time')) + chunks = {'x': 2, 'y': 1} + + expected_times = times.to_series().resample('1H').asfreq().index + # Split the times into equal sub-intervals to simulate the 6 hour + # to 1 hour up-sampling + new_times_idx = np.linspace(0, len(times) - 1, len(times) * 5) + for kind in ['linear', 'nearest', 'zero', 'slinear', 'quadratic', + 'cubic']: + actual = array.chunk(chunks).resample(time='1H').interpolate(kind) + actual = actual.compute() + f = interp1d(np.arange(len(times)), data, kind=kind, axis=-1, + bounds_error=True, assume_sorted=True) + expected_data = f(new_times_idx) + expected = DataArray(expected_data, + {'time': expected_times, 'x': xs, 'y': ys}, + ('x', 'y', 'time')) + # Use AllClose because there are some small differences in how + # we upsample timeseries versus the integer indexing as I've + # done here due to floating point arithmetic + assert_allclose(expected, actual, rtol=1e-16) - with raises_regex(TypeError, - "dask arrays are not yet supported"): - array.resample(time='1H').interpolate('linear') + # Check that an error is raised if an attempt is made to interpolate + # over a chunked dimension + with raises_regex(NotImplementedError, + 'Chunking along the dimension to be interpolated'): + array.chunk({'time': 1}).resample(time='1H').interpolate('linear') def test_align(self): array = DataArray(np.random.random((6, 8)),