Merged
40 changes: 28 additions & 12 deletions xarray/core/common.py
@@ -657,6 +657,7 @@ def resample(self, freq=None, dim=None, how=None, skipna=None,
"""
# TODO support non-string indexer after removing the old API.

from ..coding.cftime_offsets import cftime_range
from .dataarray import DataArray
from .resample import RESAMPLE_DIM
from ..coding.cftimeindex import CFTimeIndex
@@ -690,20 +691,35 @@ def resample(self, freq=None, dim=None, how=None, skipna=None,
"was passed %r" % dim)

if isinstance(self.indexes[dim_name], CFTimeIndex):
raise NotImplementedError(
'Resample is currently not supported along a dimension '
'indexed by a CFTimeIndex. For certain kinds of downsampling '
'it may be possible to work around this by converting your '
'time index to a DatetimeIndex using '
'CFTimeIndex.to_datetimeindex. Use caution when doing this '
'however, because switching to a DatetimeIndex from a '
'CFTimeIndex with a non-standard calendar entails a change '
'in the calendar type, which could lead to subtle and silent '
'errors.'
)
from ..coding.cftime_offsets import to_offset
from .resample_cftime import (_get_time_bins, _offset_timedelta,
_adjust_binner_for_upsample)
offset = to_offset(freq)
times = self.indexes[dim_name]
binner, labels = _get_time_bins(self.indexes[dim_name],
offset,
closed, label, base)
if times.size > labels.size:
# downsampling a CFTimeIndex: map each timestamp onto its bin label
if closed == 'right':
fill_method = 'bfill'
else:
fill_method = 'ffill'
binner = (pd.Series(binner, index=binner)
.reindex(times, method=fill_method))
# np.unique returns *sorted* unique values, so the bin edges that were
# actually used pair one-to-one with the sorted labels
bin_actual = np.unique(binner.values)
label_dict = dict(zip(bin_actual, labels.values))
binner = binner.map(label_dict)
grouper = ('downsampling', pd.Index(labels), binner)
else:
# upsampling a CFTimeIndex: trim the bin edges to line up with the labels
binner = _adjust_binner_for_upsample(binner, closed)
grouper = ('upsampling', pd.Index(labels), binner, closed)
else:
grouper = pd.Grouper(freq=freq, closed=closed, label=label, base=base)

group = DataArray(dim, [(dim.dims, dim)], name=RESAMPLE_DIM)
grouper = pd.Grouper(freq=freq, closed=closed, label=label, base=base)
resampler = self._resample_cls(self, group=group, dim=dim_name,
grouper=grouper,
resample_dim=RESAMPLE_DIM)
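As a point of reference, the label-mapping step in the downsampling branch above can be sketched with plain pandas (a DatetimeIndex stands in for the CFTimeIndex, and the names times, binner and labels mirror the diff; this is an illustration, not the PR's code path):

import numpy as np
import pandas as pd

# Toy stand-ins for what _get_time_bins would return for 2-day bins.
times = pd.date_range('2000-01-01', periods=6, freq='D')    # original index
binner = pd.date_range('2000-01-01', periods=4, freq='2D')  # bin edges
labels = binner[1:]                                          # right-labelled bins

# closed='left': forward-fill each timestamp onto its bin edge, then map every
# edge that was actually used to its output label, as the branch above does.
edges = pd.Series(binner, index=binner).reindex(times, method='ffill')
bin_actual = np.unique(edges.values)        # sorted, so it pairs with the labels
per_time_labels = edges.map(dict(zip(bin_actual, labels.values)))
print(per_time_labels)                      # each timestamp now carries its bin label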
24 changes: 23 additions & 1 deletion xarray/core/groupby.py
@@ -234,7 +234,29 @@ def __init__(self, obj, group, squeeze=False, grouper=None, bins=None,
# TODO: sort instead of raising an error
raise ValueError('index must be monotonic for resampling')
s = pd.Series(np.arange(index.size), index)
first_items = s.groupby(grouper).first()
if isinstance(grouper, tuple):
if grouper[0] == 'downsampling':
# downsampling a CFTimeIndex: group by the per-timestamp bin labels
labels = grouper[1]
binner = grouper[2]
first_items = s.groupby(binner).first().reindex(labels)
# reindexing on labels (grouper[1]) adds empty NaN bins,
# emulating pandas' resample behavior
elif grouper[0] == 'upsampling':
# upsampling a CFTimeIndex: reindex positions onto the new bin edges
labels = grouper[1]
binner = grouper[2]
closed = grouper[3]
if closed == 'right':
first_items = s.reindex(pd.Index(binner),
method='nearest')
first_items.index = labels
else:
first_items = s.reindex(pd.Index(binner),
method='bfill')
first_items.index = labels
else:
first_items = s.groupby(grouper).first()
full_index = first_items.index
if first_items.isnull().any():
first_items = first_items.dropna()
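To see why the reindex onto the full label set is needed, here is a small pandas-only illustration (the bin labels are made up; the point is that a bin which receives no data comes back as NaN, matching pandas' resample output):

import numpy as np
import pandas as pd

index = pd.date_range('2000-01-01', periods=4, freq='D')
s = pd.Series(np.arange(index.size), index)              # positions, as above

# Per-timestamp bin labels; the '2000-01-05' bin receives no data at all.
binner = pd.Series(pd.to_datetime(['2000-01-01', '2000-01-01',
                                   '2000-01-03', '2000-01-03']), index=index)
labels = pd.to_datetime(['2000-01-01', '2000-01-03', '2000-01-05'])

# The empty bin only appears (as NaN) after the reindex.
first_items = s.groupby(binner).first().reindex(labels)
print(first_items)                                       # 0.0, 2.0, NaN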
25 changes: 23 additions & 2 deletions xarray/core/resample.py
@@ -194,14 +194,35 @@ def _interpolate(self, kind='linear'):
.format(self._obj.data.name)
)

x = self._obj[self._dim].astype('float')
from ..coding.cftimeindex import CFTimeIndex
import cftime as cf
import numpy as np
if isinstance(self._obj[self._dim].values[0], cf.datetime):
t = self._obj[self._dim]
x = np.insert([td.total_seconds() for td in
t[1:].values - t[:-1].values], 0, 0).cumsum()
# calling total_seconds is potentially bad for performance
x = x.round()
# Rounding fixes erroneous microsecond offsets in timedelta
# (fault of CFTime), but destroys microsecond resolution data
else:
x = self._obj[self._dim].astype('float')
y = self._obj.data

axis = self._obj.get_axis_num(self._dim)

f = interp1d(x, y, kind=kind, axis=axis, bounds_error=True,
assume_sorted=True)
new_x = self._full_index.values.astype('float')
if isinstance(self._full_index.values[0], cf.datetime):
t = self._full_index
new_x = np.insert([td.total_seconds() for td in
t[1:].values - t[:-1].values], 0, 0).cumsum()
# calling total_seconds is potentially bad for performance
new_x = new_x.round()
# Rounding fixes erroneous microsecond offsets in timedelta
# (fault of CFTime), but destroys microsecond resolution data
else:
new_x = self._full_index.values.astype('float')

# construct new up-sampled DataArray
dummy = self._obj.copy()
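The coordinate conversion above can be illustrated without cftime at all: ordinary datetime objects also return datetime.timedelta from subtraction, so the same cumulative-seconds trick applies (this sketch only shows the arithmetic, not the actual cftime code path):

import datetime
import numpy as np

# Irregularly spaced datetime-like coordinates.
t = np.array([datetime.datetime(2000, 1, d) for d in (1, 2, 4, 8)])

# Cumulative seconds since the first timestamp, as in the diff above.
x = np.insert([td.total_seconds() for td in t[1:] - t[:-1]], 0, 0).cumsum()
x = x.round()    # drop spurious sub-second offsets, at the cost of sub-second data
print(x)         # 0, 86400, 259200, 604800 seconds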
151 changes: 151 additions & 0 deletions xarray/core/resample_cftime.py
@@ -0,0 +1,151 @@
"""
CFTimeIndex port of pandas resampling
(pandas/pandas/core/resample.py)
Does not support non-integer freq
"""
from __future__ import absolute_import, division, print_function

import datetime
from ..coding.cftimeindex import CFTimeIndex
from ..coding.cftime_offsets import (cftime_range, normalize_date,
Day, Hour, Minute, Second)


def _get_time_bins(index, freq, closed, label, base):
# This portion of code comes from TimeGrouper __init__ #
end_types = {'M', 'A'}
if freq._freq in end_types:
if closed is None:
closed = 'right'
if label is None:
label = 'right'
else:
if closed is None:
closed = 'left'
if label is None:
label = 'left'
# This portion of code comes from TimeGrouper __init__ #

if not isinstance(index, CFTimeIndex):
raise TypeError('index must be a CFTimeIndex, but got '
'an instance of %r' % type(index).__name__)
if len(index) == 0:
binner = labels = CFTimeIndex(data=[], name=index.name)
return binner, labels

first, last = _get_range_edges(index.min(), index.max(), freq,
closed=closed,
base=base)
binner = labels = cftime_range(freq=freq,
start=first,
end=last,
name=index.name)

if len(binner) > 1 and binner[-1] < last:
extra_date_range = cftime_range(binner[-1], last + freq,
freq=freq, name=index.name)
binner = labels = CFTimeIndex(binner.append(extra_date_range[1:]))

trimmed = False
if len(binner) > 2 and binner[-2] == last and closed == 'right':
binner = binner[:-1]
trimmed = True

if closed == 'right':
labels = binner
if label == 'right':
labels = labels[1:]
elif not trimmed:
labels = labels[:-1]
else:
if label == 'right':
labels = labels[1:]
elif not trimmed:
labels = labels[:-1]
return binner, labels


def _adjust_bin_edges(binner, ax_values, freq):
# Some hacks for > daily data, see #1471, #1458, #1483
if freq._freq not in ['D', 'H', 'T', 'min', 'S']:
# intraday values on last day
if binner[-2] > ax_values.max():
binner = binner[:-1]
return binner


def _get_range_edges(first, last, offset, closed='left', base=0):
if offset._freq in ['D', 'H', 'T', 'min', 'S']:
is_day = isinstance(offset, Day)
if (is_day and offset.n == 1) or not is_day:
return _adjust_dates_anchored(first, last, offset,
closed=closed, base=base)
else:
first = normalize_date(first)
last = normalize_date(last)

if closed == 'left':
first = offset.rollback(first)
else:
first = first - offset

last = last + offset
return first, last


def _adjust_dates_anchored(first, last, offset, closed='right', base=0):
base = base % offset.n
start_day = normalize_date(first)
base_td = datetime.timedelta(0)
if offset._freq == 'D':
base_td = datetime.timedelta(days=base)
elif offset._freq == 'H':
base_td = datetime.timedelta(hours=base)
elif offset._freq in ['T', 'min']:
base_td = datetime.timedelta(minutes=base)
elif offset._freq == 'S':
base_td = datetime.timedelta(seconds=base)
offset_td = _offset_timedelta(offset)
start_day += base_td
foffset = (first - start_day) % offset_td
loffset = (last - start_day) % offset_td
if closed == 'right':
if foffset.total_seconds() > 0:
fresult = first - foffset
else:
fresult = first - offset_td

if loffset.total_seconds() > 0:
lresult = last + (offset_td - loffset)
else:
lresult = last
else:
if foffset.total_seconds() > 0:
fresult = first - foffset
else:
fresult = first

if loffset.total_seconds() > 0:
lresult = last + (offset_td - loffset)
else:
lresult = last + offset_td
return fresult, lresult


def _offset_timedelta(offset):
if isinstance(offset, Day):
return datetime.timedelta(days=offset.n)
elif isinstance(offset, Hour):
return datetime.timedelta(hours=offset.n)
elif isinstance(offset, Minute):
return datetime.timedelta(minutes=offset.n)
elif isinstance(offset, Second):
return datetime.timedelta(seconds=offset.n)


def _adjust_binner_for_upsample(binner, closed):
if closed == 'right':
binner = binner[1:]
else:
binner = binner[:-1]
return binner
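The anchoring arithmetic in _adjust_dates_anchored can be replayed with plain datetime objects; the sketch below follows the closed='left' branch for a hypothetical 8-hour offset with base=0, with offset_td standing in for _offset_timedelta(offset):

import datetime

first = datetime.datetime(2000, 1, 1, 13, 2, 3)
last = datetime.datetime(2000, 1, 2, 5, 0, 0)
offset_td = datetime.timedelta(hours=8)

start_day = datetime.datetime(first.year, first.month, first.day)  # normalize_date
foffset = (first - start_day) % offset_td
loffset = (last - start_day) % offset_td

# closed='left': snap the first edge back and push the last edge forward.
fresult = first - foffset if foffset.total_seconds() > 0 else first
lresult = last + (offset_td - loffset) if loffset.total_seconds() > 0 else last + offset_td
print(fresult, lresult)   # 2000-01-01 08:00:00  2000-01-02 08:00:00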
118 changes: 118 additions & 0 deletions xarray/tests/temp/cftime_resample_pandas_comparison.py
@@ -0,0 +1,118 @@
import xarray as xr
import pandas as pd
import numpy as np


# Equal sampling comparisons:
ti = pd.date_range('2000-01-01', periods=9, freq='T', tz='UTC')
ps = pd.Series(np.arange(100, 100+ti.size), index=ti)
print(ps.resample('T', label='left', closed='left').mean())
print(ps.resample('T', label='right', closed='left').mean())
print(ps.resample('T', label='left', closed='right').mean())
print(ps.resample('T', label='right', closed='right').mean())
print(ps.resample('60S', label='left', closed='left').mean())
print(ps.resample('60S', label='right', closed='left').mean())
print(ps.resample('60S', label='left', closed='right').mean())
print(ps.resample('60S', label='right', closed='right').mean())
ti = pd.date_range('2000', periods=30, freq='MS', tz='UTC')
ps = pd.Series(np.arange(100, 100+ti.size), index=ti)
print(ps.resample('M', label='left', closed='left').max())
print(ps.resample('M', label='right', closed='left').max())
print(ps.resample('M', label='left', closed='right').max())
print(ps.resample('M', label='right', closed='right').max())
print(ps.resample('MS', label='left', closed='left').max())
print(ps.resample('MS', label='right', closed='left').max())
print(ps.resample('MS', label='left', closed='right').max())
print(ps.resample('MS', label='right', closed='right').max())


# Downsampling comparisons:
ti = pd.date_range('2000-01-01', periods=9, freq='MS', tz='UTC')
ps = pd.Series(np.arange(100, 100+ti.size), index=ti)
print(ps.resample('3M', label='left', closed='left').max())
print(ps.resample('3M', label='right', closed='left').max())
print(ps.resample('3M', label='left', closed='right').max())
print(ps.resample('3M', label='right', closed='right').max())
print(ps.resample('3MS', label='left', closed='left').max())
print(ps.resample('3MS', label='right', closed='left').max())
print(ps.resample('3MS', label='left', closed='right').max())
print(ps.resample('3MS', label='right', closed='right').max())
print(ps.resample('3M', label='left', closed='left').mean())
print(ps.resample('3M', label='right', closed='left').mean())
print(ps.resample('3M', label='left', closed='right').mean())
print(ps.resample('3M', label='right', closed='right').mean())
print(ps.resample('3MS', label='left', closed='left').mean())
print(ps.resample('3MS', label='right', closed='left').mean())
print(ps.resample('3MS', label='left', closed='right').mean())
print(ps.resample('3MS', label='right', closed='right').mean())
print(ps.resample('2M', label='left', closed='left').mean())
print(ps.resample('2M', label='right', closed='left').mean())
print(ps.resample('2M', label='left', closed='right').mean())
print(ps.resample('2M', label='right', closed='right').mean())
print(ps.resample('2MS', label='left', closed='left').mean())
print(ps.resample('2MS', label='right', closed='left').mean())
print(ps.resample('2MS', label='left', closed='right').mean())
print(ps.resample('2MS', label='right', closed='right').mean())
# Checking how label and closed args affect outputs
ti = pd.date_range('2000-01-01', periods=9, freq='T', tz='UTC')
ps = pd.Series(np.arange(100, 100+ti.size), index=ti)
print(ps.resample('3T', label='left', closed='left').mean())
print(ps.resample('3T', label='right', closed='left').mean())
print(ps.resample('3T', label='left', closed='right').mean())
print(ps.resample('3T', label='right', closed='right').mean())
ti = pd.date_range('2000', periods=30, freq='MS')
ps = pd.Series(np.arange(100, 100+ti.size), index=ti)
print(ps.resample('6MS', label='left', closed='left').max())
print(ps.resample('6MS', label='right', closed='left').max())
print(ps.resample('6MS', label='left', closed='right').max())
print(ps.resample('6MS', label='right', closed='right').max())
# Checking different aggregation funcs, and cases where label and closed are None
ti = pd.date_range('2000', periods=30, freq='MS')
ps = pd.Series(np.arange(100, 100+ti.size), index=ti)
print(ps.resample('MS').mean()) # essentially doing no resampling, should return original data
print(ps.resample('6MS').mean())
print(ps.resample('6MS').asfreq()) # results do not match since xarray makes asfreq = mean (see resample.py)
print(ps.resample('6MS').sum())
print(ps.resample('6MS').min())
print(ps.resample('6MS').max())


# Upsampling comparisons:
# At seconds-resolution, xr.cftime_range is 1 second off from pd.date_range
ti = pd.date_range('2011-01-01T13:02:03', '2012-01-01T00:00:00', freq='D', tz='UTC')
ps = pd.Series(np.arange(100, 100+ti.size), index=ti)
print(ps.resample('12T', base=0).interpolate().index) # testing T vs min
print(ps.resample('12min', base=0).interpolate().index)
print(ps.resample('12min', base=1).interpolate().index)
print(ps.resample('12min', base=5).interpolate().index)
print(ps.resample('12min', base=17).interpolate().index)
print(ps.resample('12S', base=17).interpolate().index)
print(ps.resample('1D', base=0).interpolate().values) # essentially doing no resampling, should return original data
print(ps.resample('1D', base=0).mean().values) # essentially doing no resampling, should return original data
# Pandas' upsampling behaves aberrantly if start times for dates are not neat; should we replicate this?
ti = pd.date_range('2000-01-01T13:02:03', '2000-02-01T00:00:00', freq='D', tz='UTC')
ps = pd.Series(np.arange(100, 100+ti.size), index=ti) # results unchanged if array of floats used
print(ps.resample('8H', base=0, closed='left').interpolate().values)
print(ps.resample('8H', base=0, closed='left').sum().values)
print(ps.resample('8H', base=0, closed='left').mean().values)
print(ps.resample('8H', base=0, closed='right').interpolate().values)
print(ps.resample('8H', base=0, closed='right').sum().values)
print(ps.resample('8H', base=0, closed='right').mean().values)
# Neat start times (00:00:00) produces expected behavior when upsampling with pandas
ti = pd.date_range('2000-01-01T00:00:00', '2000-02-01T00:00:00', freq='D', tz='UTC')
ps = pd.Series(np.arange(100, 100+ti.size), index=ti)
print(ps.resample('12T', base=0).interpolate())
print(ps.resample('12T', base=24, closed='left').interpolate())
print(ps.resample('12T', base=24, closed='right', label='left').interpolate())
ti = pd.date_range('2000', periods=30, freq='MS')
ps = pd.Series(np.arange(100, 100+ti.size), index=ti)
print(ps.resample('D').interpolate())


# Shows how resample-apply produces different results with Series and DataArray
ti = pd.date_range('2000', periods=30, freq='MS')
da = xr.DataArray(np.arange(100, 100+ti.size), [('time', ti)])
print(da.resample(time='6MS').sum())
ti = pd.date_range('2000', periods=30, freq='MS')
ps = pd.Series(np.arange(100, 100+ti.size), index=ti)
print(ps.resample('6MS').sum())
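Finally, a usage sketch of what this PR is meant to enable: resampling a DataArray indexed by a CFTimeIndex directly, mirroring the pandas calls above (this assumes the new code path added in this PR; xr.cftime_range is already part of xarray):

import numpy as np
import xarray as xr

# A cftime-indexed DataArray on a noleap calendar.
times = xr.cftime_range('2000-01-01', periods=30, freq='MS', calendar='noleap')
da = xr.DataArray(np.arange(100, 100 + times.size), [('time', times)])
print(da.resample(time='6MS').sum())   # would previously raise NotImplementedError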