Skip to content

Commit 20fddb7

Browse files
dcherian and Illviljan authored
Add groupby & resample benchmarks (#5922)
Co-authored-by: Illviljan <[email protected]>
1 parent eac78cc commit 20fddb7

File tree

1 file changed: +70 -11 lines changed

1 file changed

+70
-11
lines changed

asv_bench/benchmarks/groupby.py

+70 -11
Original file line number | Diff line number | Diff line change
@@ -1,39 +1,98 @@
11
import numpy as np
2+
import pandas as pd
23

34
import xarray as xr
45

5-
from . import parameterized, requires_dask
6+
from . import _skip_slow, parameterized, requires_dask
67

78

89
class GroupBy:
910
def setup(self, *args, **kwargs):
10-
self.ds = xr.Dataset(
11+
self.n = 100
12+
self.ds1d = xr.Dataset(
1113
{
12-
"a": xr.DataArray(np.r_[np.arange(500.0), np.arange(500.0)]),
13-
"b": xr.DataArray(np.arange(1000.0)),
14+
"a": xr.DataArray(np.r_[np.repeat(1, self.n), np.repeat(2, self.n)]),
15+
"b": xr.DataArray(np.arange(2 * self.n)),
1416
}
1517
)
18+
self.ds2d = self.ds1d.expand_dims(z=10)
1619

17-
@parameterized(["method"], [("sum", "mean")])
18-
def time_agg(self, method):
19-
return getattr(self.ds.groupby("a"), method)()
20+
@parameterized(["ndim"], [(1, 2)])
21+
def time_init(self, ndim):
22+
getattr(self, f"ds{ndim}d").groupby("b")
23+
24+
@parameterized(["method", "ndim"], [("sum", "mean"), (1, 2)])
25+
def time_agg_small_num_groups(self, method, ndim):
26+
ds = getattr(self, f"ds{ndim}d")
27+
getattr(ds.groupby("a"), method)()
28+
29+
@parameterized(["method", "ndim"], [("sum", "mean"), (1, 2)])
30+
def time_agg_large_num_groups(self, method, ndim):
31+
ds = getattr(self, f"ds{ndim}d")
32+
getattr(ds.groupby("b"), method)()
2033

2134

2235
class GroupByDask(GroupBy):
2336
def setup(self, *args, **kwargs):
2437
requires_dask()
2538
super().setup(**kwargs)
26-
self.ds = self.ds.chunk({"dim_0": 50})
39+
self.ds1d = self.ds1d.sel(dim_0=slice(None, None, 2)).chunk({"dim_0": 50})
40+
self.ds2d = self.ds2d.sel(dim_0=slice(None, None, 2)).chunk(
41+
{"dim_0": 50, "z": 5}
42+
)
2743

2844

29-
class GroupByDataFrame(GroupBy):
45+
class GroupByPandasDataFrame(GroupBy):
46+
"""Run groupby tests using pandas DataFrame."""
47+
3048
def setup(self, *args, **kwargs):
49+
# Skip testing in CI as it won't ever change in a commit:
50+
_skip_slow()
51+
3152
super().setup(**kwargs)
32-
self.ds = self.ds.to_dataframe()
53+
self.ds1d = self.ds1d.to_dataframe()
3354

3455

3556
class GroupByDaskDataFrame(GroupBy):
57+
"""Run groupby tests using dask DataFrame."""
58+
59+
def setup(self, *args, **kwargs):
60+
# Skip testing in CI as it won't ever change in a commit:
61+
_skip_slow()
62+
63+
requires_dask()
64+
super().setup(**kwargs)
65+
self.ds1d = self.ds1d.chunk({"dim_0": 50}).to_dataframe()
66+
67+
68+
class Resample:
69+
def setup(self, *args, **kwargs):
70+
self.ds1d = xr.Dataset(
71+
{
72+
"b": ("time", np.arange(365.0 * 24)),
73+
},
74+
coords={"time": pd.date_range("2001-01-01", freq="H", periods=365 * 24)},
75+
)
76+
self.ds2d = self.ds1d.expand_dims(z=10)
77+
78+
@parameterized(["ndim"], [(1, 2)])
79+
def time_init(self, ndim):
80+
getattr(self, f"ds{ndim}d").resample(time="D")
81+
82+
@parameterized(["method", "ndim"], [("sum", "mean"), (1, 2)])
83+
def time_agg_small_num_groups(self, method, ndim):
84+
ds = getattr(self, f"ds{ndim}d")
85+
getattr(ds.resample(time="3M"), method)()
86+
87+
@parameterized(["method", "ndim"], [("sum", "mean"), (1, 2)])
88+
def time_agg_large_num_groups(self, method, ndim):
89+
ds = getattr(self, f"ds{ndim}d")
90+
getattr(ds.resample(time="48H"), method)()
91+
92+
93+
class ResampleDask(Resample):
3694
def setup(self, *args, **kwargs):
3795
requires_dask()
3896
super().setup(**kwargs)
39-
self.ds = self.ds.chunk({"dim_0": 50}).to_dataframe()
97+
self.ds1d = self.ds1d.chunk({"time": 50})
98+
self.ds2d = self.ds2d.chunk({"time": 50, "z": 4})

0 commit comments

Comments
 (0)