diff --git a/xarray/core/common.py b/xarray/core/common.py index 508a19b7115..d1766793b46 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -657,6 +657,7 @@ def resample(self, freq=None, dim=None, how=None, skipna=None, """ # TODO support non-string indexer after removing the old API. + from ..coding.cftime_offsets import cftime_range from .dataarray import DataArray from .resample import RESAMPLE_DIM from ..coding.cftimeindex import CFTimeIndex @@ -690,20 +691,35 @@ def resample(self, freq=None, dim=None, how=None, skipna=None, "was passed %r" % dim) if isinstance(self.indexes[dim_name], CFTimeIndex): - raise NotImplementedError( - 'Resample is currently not supported along a dimension ' - 'indexed by a CFTimeIndex. For certain kinds of downsampling ' - 'it may be possible to work around this by converting your ' - 'time index to a DatetimeIndex using ' - 'CFTimeIndex.to_datetimeindex. Use caution when doing this ' - 'however, because switching to a DatetimeIndex from a ' - 'CFTimeIndex with a non-standard calendar entails a change ' - 'in the calendar type, which could lead to subtle and silent ' - 'errors.' - ) + from ..coding.cftime_offsets import to_offset + from .resample_cftime import (_get_time_bins, _offset_timedelta, + _adjust_binner_for_upsample) + offset = to_offset(freq) + times = self.indexes[dim_name] + binner, labels = _get_time_bins(self.indexes[dim_name], + offset, + closed, label, base) + if times.size > labels.size: + # if we're downsampling CFTimeIndex, do this: + if closed == 'right': + fill_method = 'bfill' + else: + fill_method = 'ffill' + binner = (pd.Series(binner, index=binner) + .reindex(times, method=fill_method)) + bin_actual = np.unique(binner.values) + label_dict = dict(zip(bin_actual, labels.values)) + # np.unique returns --sorted-- unique values + binner = binner.map(label_dict) + grouper = ('downsampling', pd.Index(labels), binner) + else: + # if we're upsampling CFTimeIndex, do this: + binner = _adjust_binner_for_upsample(binner, closed) + grouper = ('upsampling', pd.Index(labels), binner, closed) + else: + grouper = pd.Grouper(freq=freq, closed=closed, label=label, base=base) group = DataArray(dim, [(dim.dims, dim)], name=RESAMPLE_DIM) - grouper = pd.Grouper(freq=freq, closed=closed, label=label, base=base) resampler = self._resample_cls(self, group=group, dim=dim_name, grouper=grouper, resample_dim=RESAMPLE_DIM) diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index defe72ab3ee..5130c21a7b5 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -234,7 +234,29 @@ def __init__(self, obj, group, squeeze=False, grouper=None, bins=None, # TODO: sort instead of raising an error raise ValueError('index must be monotonic for resampling') s = pd.Series(np.arange(index.size), index) - first_items = s.groupby(grouper).first() + if isinstance(grouper, tuple): + if grouper[0] == 'downsampling': + # if we're downsampling CFTimeIndex, do this: + labels = grouper[1] + binner = grouper[2] + first_items = s.groupby(binner).first().reindex(labels) + # reindex(grouper[1]) adds empty np.nan bins to + # emulate pandas behavior + elif grouper[0] == 'upsampling': + # if we're upsampling CFTimeIndex, do this: + labels = grouper[1] + binner = grouper[2] + closed = grouper[3] + if closed == 'right': + first_items = s.reindex(pd.Index(binner), + method='nearest') + first_items.index = labels + else: + first_items = s.reindex(pd.Index(binner), + method='bfill') + first_items.index = labels + else: + first_items = s.groupby(grouper).first() full_index = first_items.index if first_items.isnull().any(): first_items = first_items.dropna() diff --git a/xarray/core/resample.py b/xarray/core/resample.py index edf7dfc3d41..b7dfb5c5cd5 100644 --- a/xarray/core/resample.py +++ b/xarray/core/resample.py @@ -194,14 +194,35 @@ def _interpolate(self, kind='linear'): .format(self._obj.data.name) ) - x = self._obj[self._dim].astype('float') + from ..coding.cftimeindex import CFTimeIndex + import cftime as cf + import numpy as np + if isinstance(self._obj[self._dim].values[0], cf.datetime): + t = self._obj[self._dim] + x = np.insert([td.total_seconds() for td in + t[1:].values - t[:-1].values], 0, 0).cumsum() + # calling total_seconds is potentially bad for performance + x = x.round() + # Rounding fixes erroneous microsecond offsets in timedelta + # (fault of CFTime), but destroys microsecond resolution data + else: + x = self._obj[self._dim].astype('float') y = self._obj.data axis = self._obj.get_axis_num(self._dim) f = interp1d(x, y, kind=kind, axis=axis, bounds_error=True, assume_sorted=True) - new_x = self._full_index.values.astype('float') + if isinstance(self._full_index.values[0], cf.datetime): + t = self._full_index + new_x = np.insert([td.total_seconds() for td in + t[1:].values - t[:-1].values], 0, 0).cumsum() + # calling total_seconds is potentially bad for performance + new_x = new_x.round() + # Rounding fixes erroneous microsecond offsets in timedelta + # (fault of CFTime), but destroys microsecond resolution data + else: + new_x = self._full_index.values.astype('float') # construct new up-sampled DataArray dummy = self._obj.copy() diff --git a/xarray/core/resample_cftime.py b/xarray/core/resample_cftime.py new file mode 100644 index 00000000000..0ccbb7163b2 --- /dev/null +++ b/xarray/core/resample_cftime.py @@ -0,0 +1,151 @@ +""" +CFTimeIndex port of pandas resampling +(pandas/pandas/core/resample.py) +Does not support non-integer freq +""" +from __future__ import absolute_import, division, print_function + +import datetime +from ..coding.cftimeindex import CFTimeIndex +from ..coding.cftime_offsets import (cftime_range, normalize_date, + Day, Hour, Minute, Second) + + +def _get_time_bins(index, freq, closed, label, base): + # This portion of code comes from TimeGrouper __init__ # + end_types = {'M', 'A'} + if freq._freq in end_types: + if closed is None: + closed = 'right' + if label is None: + label = 'right' + else: + if closed is None: + closed = 'left' + if label is None: + label = 'left' + # This portion of code comes from TimeGrouper __init__ # + + if not isinstance(index, CFTimeIndex): + raise TypeError('index must be a CFTimeIndex, but got ' + 'an instance of %r' % type(index).__name__) + if len(index) == 0: + binner = labels = CFTimeIndex(data=[], name=index.name) + return binner, [], labels + + first, last = _get_range_edges(index.min(), index.max(), freq, + closed=closed, + base=base) + binner = labels = cftime_range(freq=freq, + start=first, + end=last, + name=index.name) + + if len(binner) > 1 and binner[-1] < last: + extra_date_range = cftime_range(binner[-1], last + freq, + freq=freq, name=index.name) + binner = labels = CFTimeIndex(binner.append(extra_date_range[1:])) + + trimmed = False + if len(binner) > 2 and binner[-2] == last and closed == 'right': + binner = binner[:-1] + trimmed = True + + if closed == 'right': + labels = binner + if label == 'right': + labels = labels[1:] + elif not trimmed: + labels = labels[:-1] + else: + if label == 'right': + labels = labels[1:] + elif not trimmed: + labels = labels[:-1] + return binner, labels + + +def _adjust_bin_edges(binner, ax_values, freq): + # Some hacks for > daily data, see #1471, #1458, #1483 + if freq._freq not in ['D', 'H', 'T', 'min', 'S']: + # intraday values on last day + if binner[-2] > ax_values.max(): + binner = binner[:-1] + return binner + + +def _get_range_edges(first, last, offset, closed='left', base=0): + if offset._freq in ['D', 'H', 'T', 'min', 'S']: + is_day = isinstance(offset, Day) + if (is_day and offset.n == 1) or not is_day: + return _adjust_dates_anchored(first, last, offset, + closed=closed, base=base) + else: + first = normalize_date(first) + last = normalize_date(last) + + if closed == 'left': + first = offset.rollback(first) + else: + first = first - offset + + last = last + offset + return first, last + + +def _adjust_dates_anchored(first, last, offset, closed='right', base=0): + base = base % offset.n + start_day = normalize_date(first) + base_td = datetime.timedelta(0) + if offset._freq == 'D': + base_td = datetime.timedelta(days=base) + elif offset._freq == 'H': + base_td = datetime.timedelta(hours=base) + elif offset._freq in ['T', 'min']: + base_td = datetime.timedelta(minutes=base) + elif offset._freq == 'S': + base_td = datetime.timedelta(seconds=base) + offset_td = _offset_timedelta(offset) + start_day += base_td + foffset = (first - start_day) % offset_td + loffset = (last - start_day) % offset_td + if closed == 'right': + if foffset.total_seconds() > 0: + fresult = first - foffset + else: + fresult = first - offset_td + + if loffset.total_seconds() > 0: + lresult = last + (offset_td - loffset) + else: + lresult = last + else: + if foffset.total_seconds() > 0: + fresult = first - foffset + else: + fresult = first + + if loffset.total_seconds() > 0: + lresult = last + (offset_td - loffset) + else: + lresult = last + offset_td + return fresult, lresult + + +def _offset_timedelta(offset): + if isinstance(offset, Day): + return datetime.timedelta(days=offset.n) + elif isinstance(offset, Hour): + return datetime.timedelta(hours=offset.n) + elif isinstance(offset, Minute): + return datetime.timedelta(minutes=offset.n) + elif isinstance(offset, Second): + return datetime.timedelta(seconds=offset.n) + + +def _adjust_binner_for_upsample(binner, closed): + if closed == 'right': + binner = binner[1:] + else: + binner = binner[:-1] + return binner diff --git a/xarray/tests/temp/cftime_resample_pandas_comparison.py b/xarray/tests/temp/cftime_resample_pandas_comparison.py new file mode 100644 index 00000000000..a63b3a2ca54 --- /dev/null +++ b/xarray/tests/temp/cftime_resample_pandas_comparison.py @@ -0,0 +1,118 @@ +import xarray as xr +import pandas as pd +import numpy as np + + +# Equal sampling comparisons: +ti = pd.date_range('2000-01-01', periods=9, freq='T', tz='UTC') +ps = pd.Series(np.arange(100, 100+ti.size), index=ti) +print(ps.resample('T', label='left', closed='left').mean()) +print(ps.resample('T', label='right', closed='left').mean()) +print(ps.resample('T', label='left', closed='right').mean()) +print(ps.resample('T', label='right', closed='right').mean()) +print(ps.resample('60S', label='left', closed='left').mean()) +print(ps.resample('60S', label='right', closed='left').mean()) +print(ps.resample('60S', label='left', closed='right').mean()) +print(ps.resample('60S', label='right', closed='right').mean()) +ti = pd.date_range('2000', periods=30, freq='MS', tz='UTC') +ps = pd.Series(np.arange(100, 100+ti.size), index=ti) +print(ps.resample('M', label='left', closed='left').max()) +print(ps.resample('M', label='right', closed='left').max()) +print(ps.resample('M', label='left', closed='right').max()) +print(ps.resample('M', label='right', closed='right').max()) +print(ps.resample('MS', label='left', closed='left').max()) +print(ps.resample('MS', label='right', closed='left').max()) +print(ps.resample('MS', label='left', closed='right').max()) +print(ps.resample('MS', label='right', closed='right').max()) + + +# Downsampling comparisons: +ti = pd.date_range('2000-01-01', periods=9, freq='MS', tz='UTC') +ps = pd.Series(np.arange(100, 100+ti.size), index=ti) +print(ps.resample('3M', label='left', closed='left').max()) +print(ps.resample('3M', label='right', closed='left').max()) +print(ps.resample('3M', label='left', closed='right').max()) +print(ps.resample('3M', label='right', closed='right').max()) +print(ps.resample('3MS', label='left', closed='left').max()) +print(ps.resample('3MS', label='right', closed='left').max()) +print(ps.resample('3MS', label='left', closed='right').max()) +print(ps.resample('3MS', label='right', closed='right').max()) +print(ps.resample('3M', label='left', closed='left').mean()) +print(ps.resample('3M', label='right', closed='left').mean()) +print(ps.resample('3M', label='left', closed='right').mean()) +print(ps.resample('3M', label='right', closed='right').mean()) +print(ps.resample('3MS', label='left', closed='left').mean()) +print(ps.resample('3MS', label='right', closed='left').mean()) +print(ps.resample('3MS', label='left', closed='right').mean()) +print(ps.resample('3MS', label='right', closed='right').mean()) +print(ps.resample('2M', label='left', closed='left').mean()) +print(ps.resample('2M', label='right', closed='left').mean()) +print(ps.resample('2M', label='left', closed='right').mean()) +print(ps.resample('2M', label='right', closed='right').mean()) +print(ps.resample('2MS', label='left', closed='left').mean()) +print(ps.resample('2MS', label='right', closed='left').mean()) +print(ps.resample('2MS', label='left', closed='right').mean()) +print(ps.resample('2MS', label='right', closed='right').mean()) +# Checking how label and closed args affect outputs +ti = pd.date_range('2000-01-01', periods=9, freq='T', tz='UTC') +ps = pd.Series(np.arange(100, 100+ti.size), index=ti) +print(ps.resample('3T', label='left', closed='left').mean()) +print(ps.resample('3T', label='right', closed='left').mean()) +print(ps.resample('3T', label='left', closed='right').mean()) +print(ps.resample('3T', label='right', closed='right').mean()) +ti = pd.date_range('2000', periods=30, freq='MS') +ps = pd.Series(np.arange(100, 100+ti.size), index=ti) +print(ps.resample('6MS', label='left', closed='left').max()) +print(ps.resample('6MS', label='right', closed='left').max()) +print(ps.resample('6MS', label='left', closed='right').max()) +print(ps.resample('6MS', label='right', closed='right').max()) +# Checking different aggregation funcs, also checking cases when label and closed == None +ti = pd.date_range('2000', periods=30, freq='MS') +ps = pd.Series(np.arange(100, 100+ti.size), index=ti) +print(ps.resample('MS').mean()) # essentially doing no resampling, should return original data +print(ps.resample('6MS').mean()) +print(ps.resample('6MS').asfreq()) # results do not match since xarray makes asfreq = mean (see resample.py) +print(ps.resample('6MS').sum()) +print(ps.resample('6MS').min()) +print(ps.resample('6MS').max()) + + +# Upsampling comparisons: +# At seconds-resolution, xr.cftime_range is 1 second off from pd.date_range +ti = pd.date_range('2011-01-01T13:02:03', '2012-01-01T00:00:00', freq='D', tz='UTC') +ps = pd.Series(np.arange(100, 100+ti.size), index=ti) +print(ps.resample('12T', base=0).interpolate().index) # testing T vs min +print(ps.resample('12min', base=0).interpolate().index) +print(ps.resample('12min', base=1).interpolate().index) +print(ps.resample('12min', base=5).interpolate().index) +print(ps.resample('12min', base=17).interpolate().index) +print(ps.resample('12S', base=17).interpolate().index) +print(ps.resample('1D', base=0).interpolate().values) # essentially doing no resampling, should return original data +print(ps.resample('1D', base=0).mean().values) # essentially doing no resampling, should return original data +# Pandas' upsampling behave aberrantly if start times for dates are not neat, should we replicate? +ti = pd.date_range('2000-01-01T13:02:03', '2000-02-01T00:00:00', freq='D', tz='UTC') +ps = pd.Series(np.arange(100, 100+ti.size), index=ti) # results unchanged if array of floats used +print(ps.resample('8H', base=0, closed='left').interpolate().values) +print(ps.resample('8H', base=0, closed='left').sum().values) +print(ps.resample('8H', base=0, closed='left').mean().values) +print(ps.resample('8H', base=0, closed='right').interpolate().values) +print(ps.resample('8H', base=0, closed='right').sum().values) +print(ps.resample('8H', base=0, closed='right').mean().values) +# Neat start times (00:00:00) produces expected behavior when upsampling with pandas +ti = pd.date_range('2000-01-01T00:00:00', '2000-02-01T00:00:00', freq='D', tz='UTC') +ps = pd.Series(np.arange(100, 100+ti.size), index=ti) +print(ps.resample('12T', base=0).interpolate()) +print(ps.resample('12T', base=24, closed='left').interpolate()) +print(ps.resample('12T', base=24, closed='right', label='left').interpolate()) +ti = pd.date_range('2000', periods=30, freq='MS') +ps = pd.Series(np.arange(100, 100+ti.size), index=ti) +print(ps.resample('D').interpolate()) + + +# Shows how resample-apply produces different results with Series and DataArray +ti = pd.date_range('2000', periods=30, freq='MS') +da = xr.DataArray(np.arange(100, 100+ti.size), [('time', ti)]) +print(da.resample(time='6MS').sum()) +ti = pd.date_range('2000', periods=30, freq='MS') +ps = pd.Series(np.arange(100, 100+ti.size), index=ti) +print(ps.resample('6MS').sum()) diff --git a/xarray/tests/temp/cftime_resample_tests.py b/xarray/tests/temp/cftime_resample_tests.py new file mode 100644 index 00000000000..6357365dada --- /dev/null +++ b/xarray/tests/temp/cftime_resample_tests.py @@ -0,0 +1,152 @@ +import xarray as xr +import numpy as np + + +# Equal sampling comparisons: +times = xr.cftime_range('2000-01-01', periods=9, freq='T', tz='UTC') +da = xr.DataArray(np.arange(100, 100+times.size), [('time', times)]) +print(da.resample(time='T', label='left', closed='left').mean()) +print(da.resample(time='T', label='right', closed='left').mean()) +print(da.resample(time='T', label='left', closed='right').mean()) +print(da.resample(time='T', label='right', closed='right').mean()) +print(da.resample(time='60S', label='left', closed='left').mean()) +print(da.resample(time='60S', label='right', closed='left').mean()) +print(da.resample(time='60S', label='left', closed='right').mean()) +print(da.resample(time='60S', label='right', closed='right').mean()) +times = xr.cftime_range('2000', periods=30, freq='MS') +da = xr.DataArray(np.arange(100, 100+times.size), [('time', times)]) +print(da.resample(time='M', label='left', closed='left').max()) +print(da.resample(time='M', label='right', closed='left').max()) +print(da.resample(time='M', label='left', closed='right').max()) +print(da.resample(time='M', label='right', closed='right').max()) +print(da.resample(time='MS', label='left', closed='left').max()) +print(da.resample(time='MS', label='right', closed='left').max()) +print(da.resample(time='MS', label='left', closed='right').max()) +print(da.resample(time='MS', label='right', closed='right').max()) + + +# Downsampling comparisons: +times = xr.cftime_range('2000-01-01', periods=9, freq='MS', tz='UTC') +da = xr.DataArray(np.arange(100, 100+times.size), [('time', times)]) +print(da.resample(time='3M', label='left', closed='left').max()) +print(da.resample(time='3M', label='right', closed='left').max()) +print(da.resample(time='3M', label='left', closed='right').max()) +print(da.resample(time='3M', label='right', closed='right').max()) +print(da.resample(time='3MS', label='left', closed='left').max()) +print(da.resample(time='3MS', label='right', closed='left').max()) +print(da.resample(time='3MS', label='left', closed='right').max()) +print(da.resample(time='3MS', label='right', closed='right').max()) +print(da.resample(time='3M', label='left', closed='left').mean()) +print(da.resample(time='3M', label='right', closed='left').mean()) +print(da.resample(time='3M', label='left', closed='right').mean()) +print(da.resample(time='3M', label='right', closed='right').mean()) +print(da.resample(time='3MS', label='left', closed='left').mean()) +print(da.resample(time='3MS', label='right', closed='left').mean()) +print(da.resample(time='3MS', label='left', closed='right').mean()) +print(da.resample(time='3MS', label='right', closed='right').mean()) +print(da.resample(time='2M', label='left', closed='left').mean()) +print(da.resample(time='2M', label='right', closed='left').mean()) +print(da.resample(time='2M', label='left', closed='right').mean()) +print(da.resample(time='2M', label='right', closed='right').mean()) +print(da.resample(time='2MS', label='left', closed='left').mean()) +print(da.resample(time='2MS', label='right', closed='left').mean()) +print(da.resample(time='2MS', label='left', closed='right').mean()) +print(da.resample(time='2MS', label='right', closed='right').mean()) +# Checking how label and closed args affect outputs +times = xr.cftime_range('2000-01-01', periods=9, freq='T', tz='UTC') +da = xr.DataArray(np.arange(100, 100+times.size), [('time', times)]) +print(da.resample(time='3T', label='left', closed='left').mean()) +print(da.resample(time='3T', label='right', closed='left').mean()) +print(da.resample(time='3T', label='left', closed='right').mean()) +print(da.resample(time='3T', label='right', closed='right').mean()) +times = xr.cftime_range('2000', periods=30, freq='MS') +da = xr.DataArray(np.arange(100, 100+times.size), [('time', times)]) +print(da.resample(time='6MS', label='left', closed='left').max()) +print(da.resample(time='6MS', label='right', closed='left').max()) +print(da.resample(time='6MS', label='left', closed='right').max()) +print(da.resample(time='6MS', label='right', closed='right').max()) +# Checking different aggregation funcs, also checking cases when label and closed == None +times = xr.cftime_range('2000', periods=30, freq='MS') +da = xr.DataArray(np.arange(100, 100+times.size), [('time', times)]) +print(da.resample(time='MS').mean()) # essentially doing no resampling, should return original data +print(da.resample(time='6MS').mean()) +print(da.resample(time='6MS').asfreq()) # results do not match since xarray makes asfreq = mean (see resample.py) +print(da.resample(time='6MS').sum()) +print(da.resample(time='6MS').min()) +print(da.resample(time='6MS').max()) + + +# Upsampling comparisons: +# At seconds-resolution, xr.cftime_range is 1 second off from pd.date_range +times = xr.cftime_range('2011-01-01T13:02:03', '2012-01-01T00:00:00', freq='D') +da = xr.DataArray(np.arange(100, 100+times.size), [('time', times)]) +print(da.resample(time='12T', base=0).interpolate().indexes) # testing 'T' vs 'min' +print(da.resample(time='12min', base=0).interpolate().indexes) +print(da.resample(time='12min', base=1).interpolate().indexes) +print(da.resample(time='12min', base=5).mean().indexes) +print(da.resample(time='12min', base=17).mean().indexes) +print(da.resample(time='12S', base=17).interpolate().indexes) +print(da.resample(time='1D', base=0).interpolate().values) # essentially doing no resampling, should return original data +print(da.resample(time='1D', base=0).mean().values) # essentially doing no resampling, should return original data +# Upsampling with non 00:00:00 dates. Sum and mean matches pandas behavior but interpolate doesn't. +times = xr.cftime_range('2000-01-01T13:02:03', '2000-02-01T00:00:00', freq='D') +da = xr.DataArray(np.arange(100, 100+times.size), [('time', times)]) +print(da.resample(time='8H', base=0, closed='left').interpolate().values) +print(da.resample(time='8H', base=0, closed='left').sum().values) +print(da.resample(time='8H', base=0, closed='left').mean().values) +print(da.resample(time='8H', base=0, closed='right').interpolate().values) +print(da.resample(time='8H', base=0, closed='right').sum().values) +print(da.resample(time='8H', base=0, closed='right').mean().values) +# Neat start times (00:00:00) produces behavior matching pandas' +times = xr.cftime_range('2000-01-01T00:00:00', '2000-02-01T00:00:00', freq='D') +da = xr.DataArray(np.arange(100, 100+times.size), [('time', times)]) +print(da.resample(time='12T', base=0).interpolate()) +print(da.resample(time='12T', base=24, closed='left').interpolate()) +print(da.resample(time='12T', base=24, closed='right', label='left').interpolate()) +times = xr.cftime_range('2000', periods=30, freq='MS') +da = xr.DataArray(np.arange(100, 100+times.size), [('time', times)]) +print(da.resample(time='D').interpolate()) + + +# Check that Dataset and DataArray returns the same resampling results +times = xr.cftime_range('2000-01-01', periods=9, freq='T', tz='UTC') +ds = xr.Dataset(data_vars={'data1': ('time', np.arange(100, 100+times.size)), + 'data2': ('time', np.arange(500, 500+times.size))}, + coords={'time': times}) +print(ds.resample(time='3T', label='left', closed='left').mean()) +print(ds.resample(time='3T', label='right', closed='left').mean()) +print(ds.resample(time='3T', label='left', closed='right').mean()) +print(ds.resample(time='3T', label='right', closed='right').mean()) +times = xr.cftime_range('2000', periods=30, freq='MS') +ds = xr.Dataset(data_vars={'data1': ('time', np.arange(100, 100+times.size)), + 'data2': ('time', np.arange(500, 500+times.size))}, + coords={'time': times}) +print(ds.resample(time='6MS', label='left', closed='left').max()) +print(ds.resample(time='6MS', label='right', closed='left').max()) +print(ds.resample(time='6MS', label='left', closed='right').max()) +print(ds.resample(time='6MS', label='right', closed='right').max()) + + +# Check that nc files read as dask arrays can be resampled +# +import os +testfilepath = os.path.join(os.path.expanduser('~'), 'Dropbox', 'code', 'Ouranos', 'testdata', 'NRCANdaily', 'nrcan_canada_daily_tasmax_1990.nc') +xr.set_options(enable_cftimeindex=True) +test_ds = xr.open_dataset(testfilepath, chunks={'time': 10}) +test_ds['time'] = xr.cftime_range('1999-01-01', '1999-12-31', freq='D') # regular calendars are still read as pandas date_range even though enable_cftimeindex=True +# test_ds.fillna(0).resample(time='3MS') # NaN in results still present +print(test_ds.resample(time='MS').mean()) +monthly_avg = test_ds.resample(time='MS').mean() +monthly_avg.sel(lat=49.95833206176758, lon=-79.95833587646484).to_dataframe() +chunked_cftime_val = monthly_avg.sel(lat=49.95833206176758, lon=-79.95833587646484).to_dataframe().values + +xr.set_options(enable_cftimeindex=False) +test_ds = xr.open_dataset(testfilepath, chunks={'time': 10}) +# print(test_ds.resample(time='MS').interpolate()) # xarray does not allow dask upsampling +print(test_ds.resample(time='MS').mean()) +monthly_avg = test_ds.resample(time='MS').mean() +monthly_avg.sel(lat=49.95833206176758, lon=-79.95833587646484).to_dataframe() +chunked_pandas_val = monthly_avg.sel(lat=49.95833206176758, lon=-79.95833587646484).to_dataframe().values + +# assert(np.all(chunked_cftime_val == chunked_pandas_val)) +print(np.all(chunked_cftime_val == chunked_pandas_val)) diff --git a/xarray/tests/test_cftimeindex_resample.py b/xarray/tests/test_cftimeindex_resample.py new file mode 100644 index 00000000000..ea034ba248f --- /dev/null +++ b/xarray/tests/test_cftimeindex_resample.py @@ -0,0 +1,134 @@ +from __future__ import absolute_import + +import itertools +import pytest +import datetime as dt + +import numpy as np +import pandas as pd +import xarray as xr +from xarray.tests import assert_array_equal, assert_identical + + +@pytest.fixture() +def pd_index(): + return pd.date_range('2000-01-01', periods=30, freq='MS', tz='UTC') + + +@pytest.fixture() +def xr_index(): + return xr.cftime_range('2000-01-01', periods=30, freq='MS', tz='UTC') + + +@pytest.fixture() +def daily_pd_index(): + return pd.date_range('2000-01-01', periods=900, freq='D', tz='UTC') + + +@pytest.fixture() +def daily_xr_index(): + return xr.cftime_range('2000-01-01', periods=900, freq='D', tz='UTC') + + +@pytest.fixture() +def base_pd_index(): + return pd.date_range('2000-01-01', periods=30, freq='D', tz='UTC') + + +@pytest.fixture() +def base_xr_index(): + return xr.cftime_range('2000-01-01', periods=30, freq='D', tz='UTC') + + +def da(index): + return xr.DataArray(np.arange(100., 100.+index.size), + coords=[index], dims=['time']) + + +def series(index): + return pd.Series(np.arange(100., 100.+index.size), index=index) + + +@ pytest.mark.parametrize(('closed', 'label', 'freq'), + list(itertools.product( + ['left', 'right'], + ['left', 'right'], + ['2MS', '2M', '3MS', '3M', '7MS', '7M']))) +def test_downsampler(closed, label, freq): + downsamp_series = series(pd_index()).resample( + freq, closed=closed, label=label).mean().dropna() + downsamp_da = da(xr_index()).resample( + time=freq, closed=closed, label=label).mean().dropna(dim='time') + assert np.all(downsamp_series.values == downsamp_da.values) + assert np.all(downsamp_series.index.strftime('%Y-%m-%dT%T').values == + np.array([timestamp.strftime('%Y-%m-%dT%T') for + timestamp in downsamp_da.indexes['time']])) + + +@ pytest.mark.parametrize(('closed', 'label', 'freq'), + list(itertools.product( + ['left', 'right'], + ['left', 'right'], + ['2MS', '2M', '3MS', '3M', 'AS', 'A', '2AS', '2A']))) +def test_downsampler_daily(closed, label, freq): + downsamp_series = series(daily_pd_index()).resample( + freq, closed=closed, label=label).mean().dropna() + downsamp_da = da(daily_xr_index()).resample( + time=freq, closed=closed, label=label).mean().dropna(dim='time') + assert np.all(downsamp_series.values == downsamp_da.values) + assert np.all(downsamp_series.index.strftime('%Y-%m-%dT%T').values == + np.array([timestamp.strftime('%Y-%m-%dT%T') for + timestamp in downsamp_da.indexes['time']])) + + +@ pytest.mark.parametrize(('closed', 'label', 'freq'), + list(itertools.product( + ['left', 'right'], + ['left', 'right'], + ['MS', 'M', '7D', 'D']))) +def test_upsampler(closed, label, freq): + # the testing here covers cases of equal sampling as well + # for pandas --not xarray--, .ffill() and .bfill() gives + # error (mismatched length) + upsamp_series = series(pd_index()).resample( + freq, closed=closed, label=label).mean().fillna(0) + upsamp_da = da(xr_index()).resample( + time=freq, closed=closed, label=label).mean().fillna(0) + print(upsamp_series.values) + print(upsamp_da.values) + print(upsamp_series.index.strftime('%Y-%m-%dT%T').values) + print(np.array([timestamp.strftime('%Y-%m-%dT%T') for + timestamp in upsamp_da.indexes['time']])) + assert np.all(upsamp_series.values == upsamp_da.values) + assert np.all(upsamp_series.index.strftime('%Y-%m-%dT%T').values == + np.array([timestamp.strftime('%Y-%m-%dT%T') for + timestamp in upsamp_da.indexes['time']])) + + +@ pytest.mark.parametrize(('closed', 'label', 'base'), + list(itertools.product( + ['left', 'right'], + ['left', 'right'], + [1, 5, 12, 17, 24]))) +def test_upsampler_base(closed, label, base, freq='12H'): + upsamp_series = series(base_pd_index()).resample( + freq, closed=closed, label=label, base=base).mean().dropna() + upsamp_da = da(base_xr_index()).resample( + time=freq, closed=closed, + label=label, base=base).mean().dropna(dim='time') + # fix for cftime ranges that are 1 second off + time_ix = upsamp_da.indexes['time'].values + for ix in np.arange(len(time_ix)): + if time_ix[ix].second == 59: + time_ix[ix] += dt.timedelta(seconds=1) + upsamp_da = upsamp_da.assign_coords(time=time_ix) + # fix for cftime ranges that are 1 second off + print(upsamp_series.values) + print(upsamp_da.values) + print(upsamp_series.index.strftime('%Y-%m-%dT%T').values) + print(np.array([timestamp.strftime('%Y-%m-%dT%T') for + timestamp in upsamp_da.indexes['time']])) + assert np.all(upsamp_series.values == upsamp_da.values) + assert np.all(upsamp_series.index.strftime('%Y-%m-%dT%T').values == + np.array([timestamp.strftime('%Y-%m-%dT%T') for + timestamp in upsamp_da.indexes['time']]))