Merged
40 changes: 28 additions & 12 deletions xarray/core/common.py
@@ -657,6 +657,7 @@ def resample(self, freq=None, dim=None, how=None, skipna=None,
"""
# TODO support non-string indexer after removing the old API.

from ..coding.cftime_offsets import cftime_range
from .dataarray import DataArray
from .resample import RESAMPLE_DIM
from ..coding.cftimeindex import CFTimeIndex
@@ -690,20 +691,35 @@ def resample(self, freq=None, dim=None, how=None, skipna=None,
"was passed %r" % dim)

if isinstance(self.indexes[dim_name], CFTimeIndex):
raise NotImplementedError(
'Resample is currently not supported along a dimension '
'indexed by a CFTimeIndex. For certain kinds of downsampling '
'it may be possible to work around this by converting your '
'time index to a DatetimeIndex using '
'CFTimeIndex.to_datetimeindex. Use caution when doing this '
'however, because switching to a DatetimeIndex from a '
'CFTimeIndex with a non-standard calendar entails a change '
'in the calendar type, which could lead to subtle and silent '
'errors.'
)
from ..coding.cftime_offsets import to_offset
from .resample_cftime import (_get_time_bins, _offset_timedelta,
_adjust_binner_for_upsample)
offset = to_offset(freq)
times = self.indexes[dim_name]
binner, labels = _get_time_bins(self.indexes[dim_name],
offset,
closed, label, base)
if times.size > labels.size:
# downsampling a CFTimeIndex: map each timestamp onto its bin label
if closed == 'right':
fill_method = 'bfill'
else:
fill_method = 'ffill'
binner = (pd.Series(binner, index=binner)
.reindex(times, method=fill_method))
# np.unique returns *sorted* unique values, so the bin edges that were
# actually used pair one-to-one with the sorted labels
bin_actual = np.unique(binner.values)
label_dict = dict(zip(bin_actual, labels.values))
binner = binner.map(label_dict)
grouper = ('downsampling', pd.Index(labels), binner)
else:
# upsampling a CFTimeIndex: trim the bin edges to line up with the labels
binner = _adjust_binner_for_upsample(binner, closed)
grouper = ('upsampling', pd.Index(labels), binner, closed)
else:
grouper = pd.Grouper(freq=freq, closed=closed, label=label, base=base)

group = DataArray(dim, [(dim.dims, dim)], name=RESAMPLE_DIM)
grouper = pd.Grouper(freq=freq, closed=closed, label=label, base=base)
resampler = self._resample_cls(self, group=group, dim=dim_name,
grouper=grouper,
resample_dim=RESAMPLE_DIM)
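As a point of reference, the label-mapping step in the downsampling branch above can be sketched with plain pandas (a DatetimeIndex stands in for the CFTimeIndex, and the names times, binner and labels mirror the diff; this is an illustration, not the PR's code path):

import numpy as np
import pandas as pd

# Toy stand-ins for what _get_time_bins would return for 2-day bins.
times = pd.date_range('2000-01-01', periods=6, freq='D')    # original index
binner = pd.date_range('2000-01-01', periods=4, freq='2D')  # bin edges
labels = binner[1:]                                          # right-labelled bins

# closed='left': forward-fill each timestamp onto its bin edge, then map every
# edge that was actually used to its output label, as the branch above does.
edges = pd.Series(binner, index=binner).reindex(times, method='ffill')
bin_actual = np.unique(edges.values)        # sorted, so it pairs with the labels
per_time_labels = edges.map(dict(zip(bin_actual, labels.values)))
print(per_time_labels)                      # each timestamp now carries its bin label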
24 changes: 23 additions & 1 deletion xarray/core/groupby.py
@@ -234,7 +234,29 @@ def __init__(self, obj, group, squeeze=False, grouper=None, bins=None,
# TODO: sort instead of raising an error
raise ValueError('index must be monotonic for resampling')
s = pd.Series(np.arange(index.size), index)
first_items = s.groupby(grouper).first()
if isinstance(grouper, tuple):
if grouper[0] == 'downsampling':
# downsampling a CFTimeIndex: group by the per-timestamp bin labels
labels = grouper[1]
binner = grouper[2]
first_items = s.groupby(binner).first().reindex(labels)
# reindexing on labels (grouper[1]) adds empty NaN bins,
# emulating pandas' resample behavior
elif grouper[0] == 'upsampling':
# upsampling a CFTimeIndex: reindex positions onto the new bin edges
labels = grouper[1]
binner = grouper[2]
closed = grouper[3]
if closed == 'right':
first_items = s.reindex(pd.Index(binner),
method='nearest')
first_items.index = labels
else:
first_items = s.reindex(pd.Index(binner),
method='bfill')
first_items.index = labels
else:
first_items = s.groupby(grouper).first()
full_index = first_items.index
if first_items.isnull().any():
first_items = first_items.dropna()
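To see why the reindex onto the full label set is needed, here is a small pandas-only illustration (the bin labels are made up; the point is that a bin which receives no data comes back as NaN, matching pandas' resample output):

import numpy as np
import pandas as pd

index = pd.date_range('2000-01-01', periods=4, freq='D')
s = pd.Series(np.arange(index.size), index)              # positions, as above

# Per-timestamp bin labels; the '2000-01-05' bin receives no data at all.
binner = pd.Series(pd.to_datetime(['2000-01-01', '2000-01-01',
                                   '2000-01-03', '2000-01-03']), index=index)
labels = pd.to_datetime(['2000-01-01', '2000-01-03', '2000-01-05'])

# The empty bin only appears (as NaN) after the reindex.
first_items = s.groupby(binner).first().reindex(labels)
print(first_items)                                       # 0.0, 2.0, NaN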
25 changes: 23 additions & 2 deletions xarray/core/resample.py
@@ -194,14 +194,35 @@ def _interpolate(self, kind='linear'):
.format(self._obj.data.name)
)

x = self._obj[self._dim].astype('float')
from ..coding.cftimeindex import CFTimeIndex
import cftime as cf
import numpy as np
if isinstance(self._obj[self._dim].values[0], cf.datetime):
t = self._obj[self._dim]
x = np.insert([td.total_seconds() for td in
t[1:].values - t[:-1].values], 0, 0).cumsum()
# calling total_seconds is potentially bad for performance
x = x.round()
# Rounding fixes erroneous microsecond offsets in timedelta
# (fault of CFTime), but destroys microsecond resolution data
else:
x = self._obj[self._dim].astype('float')
y = self._obj.data

axis = self._obj.get_axis_num(self._dim)

f = interp1d(x, y, kind=kind, axis=axis, bounds_error=True,
assume_sorted=True)
new_x = self._full_index.values.astype('float')
if isinstance(self._full_index.values[0], cf.datetime):
t = self._full_index
new_x = np.insert([td.total_seconds() for td in
t[1:].values - t[:-1].values], 0, 0).cumsum()
# calling total_seconds is potentially bad for performance
new_x = new_x.round()
# Rounding fixes erroneous microsecond offsets in timedelta
# (fault of CFTime), but destroys microsecond resolution data
else:
new_x = self._full_index.values.astype('float')

# construct new up-sampled DataArray
dummy = self._obj.copy()
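The coordinate conversion above can be illustrated without cftime at all: ordinary datetime objects also return datetime.timedelta from subtraction, so the same cumulative-seconds trick applies (this sketch only shows the arithmetic, not the actual cftime code path):

import datetime
import numpy as np

# Irregularly spaced datetime-like coordinates.
t = np.array([datetime.datetime(2000, 1, d) for d in (1, 2, 4, 8)])

# Cumulative seconds since the first timestamp, as in the diff above.
x = np.insert([td.total_seconds() for td in t[1:] - t[:-1]], 0, 0).cumsum()
x = x.round()    # drop spurious sub-second offsets, at the cost of sub-second data
print(x)         # 0, 86400, 259200, 604800 seconds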
151 changes: 151 additions & 0 deletions xarray/core/resample_cftime.py
@@ -0,0 +1,151 @@
"""
CFTimeIndex port of pandas resampling
(pandas/pandas/core/resample.py)
Does not support non-integer freq
"""
from __future__ import absolute_import, division, print_function

import datetime
from ..coding.cftimeindex import CFTimeIndex
from ..coding.cftime_offsets import (cftime_range, normalize_date,
Day, Hour, Minute, Second)


def _get_time_bins(index, freq, closed, label, base):
# This portion of code comes from TimeGrouper __init__ #
end_types = {'M', 'A'}
if freq._freq in end_types:
if closed is None:
closed = 'right'
if label is None:
label = 'right'
else:
if closed is None:
closed = 'left'
if label is None:
label = 'left'
# This portion of code comes from TimeGrouper __init__ #

if not isinstance(index, CFTimeIndex):
raise TypeError('index must be a CFTimeIndex, but got '
'an instance of %r' % type(index).__name__)
if len(index) == 0:
binner = labels = CFTimeIndex(data=[], name=index.name)
return binner, labels

first, last = _get_range_edges(index.min(), index.max(), freq,
closed=closed,
base=base)
binner = labels = cftime_range(freq=freq,
start=first,
end=last,
name=index.name)

if len(binner) > 1 and binner[-1] < last:
extra_date_range = cftime_range(binner[-1], last + freq,
freq=freq, name=index.name)
binner = labels = CFTimeIndex(binner.append(extra_date_range[1:]))

trimmed = False
if len(binner) > 2 and binner[-2] == last and closed == 'right':
binner = binner[:-1]
trimmed = True

if closed == 'right':
labels = binner
if label == 'right':
labels = labels[1:]
elif not trimmed:
labels = labels[:-1]
else:
if label == 'right':
labels = labels[1:]
elif not trimmed:
labels = labels[:-1]
return binner, labels


def _adjust_bin_edges(binner, ax_values, freq):
# Some hacks for > daily data, see #1471, #1458, #1483
if freq._freq not in ['D', 'H', 'T', 'min', 'S']:
# intraday values on last day
if binner[-2] > ax_values.max():
binner = binner[:-1]
return binner


def _get_range_edges(first, last, offset, closed='left', base=0):
if offset._freq in ['D', 'H', 'T', 'min', 'S']:
is_day = isinstance(offset, Day)
if (is_day and offset.n == 1) or not is_day:
return _adjust_dates_anchored(first, last, offset,
closed=closed, base=base)
else:
first = normalize_date(first)
last = normalize_date(last)

if closed == 'left':
first = offset.rollback(first)
else:
first = first - offset

last = last + offset
return first, last


def _adjust_dates_anchored(first, last, offset, closed='right', base=0):
base = base % offset.n
start_day = normalize_date(first)
base_td = datetime.timedelta(0)
if offset._freq == 'D':
base_td = datetime.timedelta(days=base)
elif offset._freq == 'H':
base_td = datetime.timedelta(hours=base)
elif offset._freq in ['T', 'min']:
base_td = datetime.timedelta(minutes=base)
elif offset._freq == 'S':
base_td = datetime.timedelta(seconds=base)
offset_td = _offset_timedelta(offset)
start_day += base_td
foffset = (first - start_day) % offset_td
loffset = (last - start_day) % offset_td
if closed == 'right':
if foffset.total_seconds() > 0:
fresult = first - foffset
else:
fresult = first - offset_td

if loffset.total_seconds() > 0:
lresult = last + (offset_td - loffset)
else:
lresult = last
else:
if foffset.total_seconds() > 0:
fresult = first - foffset
else:
fresult = first

if loffset.total_seconds() > 0:
lresult = last + (offset_td - loffset)
else:
lresult = last + offset_td
return fresult, lresult


def _offset_timedelta(offset):
if isinstance(offset, Day):
return datetime.timedelta(days=offset.n)
elif isinstance(offset, Hour):
return datetime.timedelta(hours=offset.n)
elif isinstance(offset, Minute):
return datetime.timedelta(minutes=offset.n)
elif isinstance(offset, Second):
return datetime.timedelta(seconds=offset.n)


def _adjust_binner_for_upsample(binner, closed):
if closed == 'right':
binner = binner[1:]
else:
binner = binner[:-1]
return binner
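The anchoring arithmetic in _adjust_dates_anchored can be replayed with plain datetime objects; the sketch below follows the closed='left' branch for a hypothetical 8-hour offset with base=0, with offset_td standing in for _offset_timedelta(offset):

import datetime

first = datetime.datetime(2000, 1, 1, 13, 2, 3)
last = datetime.datetime(2000, 1, 2, 5, 0, 0)
offset_td = datetime.timedelta(hours=8)

start_day = datetime.datetime(first.year, first.month, first.day)  # normalize_date
foffset = (first - start_day) % offset_td
loffset = (last - start_day) % offset_td

# closed='left': snap the first edge back and push the last edge forward.
fresult = first - foffset if foffset.total_seconds() > 0 else first
lresult = last + (offset_td - loffset) if loffset.total_seconds() > 0 else last + offset_td
print(fresult, lresult)   # 2000-01-01 08:00:00  2000-01-02 08:00:00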
118 changes: 118 additions & 0 deletions xarray/tests/temp/cftime_resample_pandas_comparison.py
@@ -0,0 +1,118 @@
import xarray as xr
import pandas as pd
import numpy as np


# Equal sampling comparisons:
ti = pd.date_range('2000-01-01', periods=9, freq='T', tz='UTC')
ps = pd.Series(np.arange(100, 100+ti.size), index=ti)
print(ps.resample('T', label='left', closed='left').mean())
print(ps.resample('T', label='right', closed='left').mean())
print(ps.resample('T', label='left', closed='right').mean())
print(ps.resample('T', label='right', closed='right').mean())
print(ps.resample('60S', label='left', closed='left').mean())
print(ps.resample('60S', label='right', closed='left').mean())
print(ps.resample('60S', label='left', closed='right').mean())
print(ps.resample('60S', label='right', closed='right').mean())
ti = pd.date_range('2000', periods=30, freq='MS', tz='UTC')
ps = pd.Series(np.arange(100, 100+ti.size), index=ti)
print(ps.resample('M', label='left', closed='left').max())
print(ps.resample('M', label='right', closed='left').max())
print(ps.resample('M', label='left', closed='right').max())
print(ps.resample('M', label='right', closed='right').max())
print(ps.resample('MS', label='left', closed='left').max())
print(ps.resample('MS', label='right', closed='left').max())
print(ps.resample('MS', label='left', closed='right').max())
print(ps.resample('MS', label='right', closed='right').max())


# Downsampling comparisons:
ti = pd.date_range('2000-01-01', periods=9, freq='MS', tz='UTC')
ps = pd.Series(np.arange(100, 100+ti.size), index=ti)
print(ps.resample('3M', label='left', closed='left').max())
print(ps.resample('3M', label='right', closed='left').max())
print(ps.resample('3M', label='left', closed='right').max())
print(ps.resample('3M', label='right', closed='right').max())
print(ps.resample('3MS', label='left', closed='left').max())
print(ps.resample('3MS', label='right', closed='left').max())
print(ps.resample('3MS', label='left', closed='right').max())
print(ps.resample('3MS', label='right', closed='right').max())
print(ps.resample('3M', label='left', closed='left').mean())
print(ps.resample('3M', label='right', closed='left').mean())
print(ps.resample('3M', label='left', closed='right').mean())
print(ps.resample('3M', label='right', closed='right').mean())
print(ps.resample('3MS', label='left', closed='left').mean())
print(ps.resample('3MS', label='right', closed='left').mean())
print(ps.resample('3MS', label='left', closed='right').mean())
print(ps.resample('3MS', label='right', closed='right').mean())
print(ps.resample('2M', label='left', closed='left').mean())
print(ps.resample('2M', label='right', closed='left').mean())
print(ps.resample('2M', label='left', closed='right').mean())
print(ps.resample('2M', label='right', closed='right').mean())
print(ps.resample('2MS', label='left', closed='left').mean())
print(ps.resample('2MS', label='right', closed='left').mean())
print(ps.resample('2MS', label='left', closed='right').mean())
print(ps.resample('2MS', label='right', closed='right').mean())
# Checking how label and closed args affect outputs
ti = pd.date_range('2000-01-01', periods=9, freq='T', tz='UTC')
ps = pd.Series(np.arange(100, 100+ti.size), index=ti)
print(ps.resample('3T', label='left', closed='left').mean())
print(ps.resample('3T', label='right', closed='left').mean())
print(ps.resample('3T', label='left', closed='right').mean())
print(ps.resample('3T', label='right', closed='right').mean())
ti = pd.date_range('2000', periods=30, freq='MS')
ps = pd.Series(np.arange(100, 100+ti.size), index=ti)
print(ps.resample('6MS', label='left', closed='left').max())
print(ps.resample('6MS', label='right', closed='left').max())
print(ps.resample('6MS', label='left', closed='right').max())
print(ps.resample('6MS', label='right', closed='right').max())
# Checking different aggregation funcs, and cases where label and closed are None
ti = pd.date_range('2000', periods=30, freq='MS')
ps = pd.Series(np.arange(100, 100+ti.size), index=ti)
print(ps.resample('MS').mean()) # essentially doing no resampling, should return original data
print(ps.resample('6MS').mean())
print(ps.resample('6MS').asfreq()) # results do not match since xarray makes asfreq = mean (see resample.py)
print(ps.resample('6MS').sum())
print(ps.resample('6MS').min())
print(ps.resample('6MS').max())


# Upsampling comparisons:
# At seconds-resolution, xr.cftime_range is 1 second off from pd.date_range
ti = pd.date_range('2011-01-01T13:02:03', '2012-01-01T00:00:00', freq='D', tz='UTC')
ps = pd.Series(np.arange(100, 100+ti.size), index=ti)
print(ps.resample('12T', base=0).interpolate().index) # testing T vs min
print(ps.resample('12min', base=0).interpolate().index)
print(ps.resample('12min', base=1).interpolate().index)
print(ps.resample('12min', base=5).interpolate().index)
print(ps.resample('12min', base=17).interpolate().index)
print(ps.resample('12S', base=17).interpolate().index)
print(ps.resample('1D', base=0).interpolate().values) # essentially doing no resampling, should return original data
print(ps.resample('1D', base=0).mean().values) # essentially doing no resampling, should return original data
# Pandas' upsampling behaves aberrantly if start times for dates are not neat; should we replicate this?
ti = pd.date_range('2000-01-01T13:02:03', '2000-02-01T00:00:00', freq='D', tz='UTC')
ps = pd.Series(np.arange(100, 100+ti.size), index=ti) # results unchanged if array of floats used
print(ps.resample('8H', base=0, closed='left').interpolate().values)
print(ps.resample('8H', base=0, closed='left').sum().values)
print(ps.resample('8H', base=0, closed='left').mean().values)
print(ps.resample('8H', base=0, closed='right').interpolate().values)
print(ps.resample('8H', base=0, closed='right').sum().values)
print(ps.resample('8H', base=0, closed='right').mean().values)
# Neat start times (00:00:00) produces expected behavior when upsampling with pandas
ti = pd.date_range('2000-01-01T00:00:00', '2000-02-01T00:00:00', freq='D', tz='UTC')
ps = pd.Series(np.arange(100, 100+ti.size), index=ti)
print(ps.resample('12T', base=0).interpolate())
print(ps.resample('12T', base=24, closed='left').interpolate())
print(ps.resample('12T', base=24, closed='right', label='left').interpolate())
ti = pd.date_range('2000', periods=30, freq='MS')
ps = pd.Series(np.arange(100, 100+ti.size), index=ti)
print(ps.resample('D').interpolate())


# Shows how resample-apply produces different results with Series and DataArray
ti = pd.date_range('2000', periods=30, freq='MS')
da = xr.DataArray(np.arange(100, 100+ti.size), [('time', ti)])
print(da.resample(time='6MS').sum())
ti = pd.date_range('2000', periods=30, freq='MS')
ps = pd.Series(np.arange(100, 100+ti.size), index=ti)
print(ps.resample('6MS').sum())
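Finally, a usage sketch of what this PR is meant to enable: resampling a DataArray indexed by a CFTimeIndex directly, mirroring the pandas calls above (this assumes the new code path added in this PR; xr.cftime_range is already part of xarray):

import numpy as np
import xarray as xr

# A cftime-indexed DataArray on a noleap calendar.
times = xr.cftime_range('2000-01-01', periods=30, freq='MS', calendar='noleap')
da = xr.DataArray(np.arange(100, 100 + times.size), [('time', times)])
print(da.resample(time='6MS').sum())   # would previously raise NotImplementedError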