From a0afe138d44743a43aa1a9c43602634e990d5339 Mon Sep 17 00:00:00 2001 From: dcherian Date: Mon, 27 Mar 2023 21:02:30 -0600 Subject: [PATCH 1/2] [skip-ci] Add compute to groupby benchmarks --- asv_bench/benchmarks/groupby.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 8124883b6a0..2da0f60b4ae 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -1,3 +1,5 @@ +# import flox to avoid the cost of first import +import flox.xarray # noqa import numpy as np import pandas as pd @@ -27,24 +29,24 @@ def time_init(self, ndim): @parameterized(["method", "ndim"], [("sum", "mean"), (1, 2)]) def time_agg_small_num_groups(self, method, ndim): ds = getattr(self, f"ds{ndim}d") - getattr(ds.groupby("a"), method)() + getattr(ds.groupby("a"), method)().compute() @parameterized(["method", "ndim"], [("sum", "mean"), (1, 2)]) def time_agg_large_num_groups(self, method, ndim): ds = getattr(self, f"ds{ndim}d") - getattr(ds.groupby("b"), method)() + getattr(ds.groupby("b"), method)().compute() def time_binary_op_1d(self): - self.ds1d.groupby("b") - self.ds1d_mean + (self.ds1d.groupby("b") - self.ds1d_mean).compute() def time_binary_op_2d(self): - self.ds2d.groupby("b") - self.ds2d_mean + (self.ds2d.groupby("b") - self.ds2d_mean).compute() def peakmem_binary_op_1d(self): - self.ds1d.groupby("b") - self.ds1d_mean + (self.ds1d.groupby("b") - self.ds1d_mean).compute() def peakmem_binary_op_2d(self): - self.ds2d.groupby("b") - self.ds2d_mean + (self.ds2d.groupby("b") - self.ds2d_mean).compute() class GroupByDask(GroupBy): @@ -56,8 +58,8 @@ def setup(self, *args, **kwargs): self.ds1d["c"] = self.ds1d["c"].chunk({"dim_0": 50}) self.ds2d = self.ds2d.sel(dim_0=slice(None, None, 2)) self.ds2d["c"] = self.ds2d["c"].chunk({"dim_0": 50, "z": 5}) - self.ds1d_mean = self.ds1d.groupby("b").mean() - self.ds2d_mean = self.ds2d.groupby("b").mean() + self.ds1d_mean = self.ds1d.groupby("b").mean().compute() + self.ds2d_mean = self.ds2d.groupby("b").mean().compute() class GroupByPandasDataFrame(GroupBy): @@ -69,7 +71,7 @@ def setup(self, *args, **kwargs): super().setup(**kwargs) self.ds1d = self.ds1d.to_dataframe() - self.ds1d_mean = self.ds1d.groupby("b").mean() + self.ds1d_mean = self.ds1d.groupby("b").mean().compute() def time_binary_op_2d(self): raise NotImplementedError @@ -88,7 +90,7 @@ def setup(self, *args, **kwargs): requires_dask() super().setup(**kwargs) self.ds1d = self.ds1d.chunk({"dim_0": 50}).to_dataframe() - self.ds1d_mean = self.ds1d.groupby("b").mean() + self.ds1d_mean = self.ds1d.groupby("b").mean().compute() def time_binary_op_2d(self): raise NotImplementedError @@ -116,12 +118,12 @@ def time_init(self, ndim): @parameterized(["method", "ndim"], [("sum", "mean"), (1, 2)]) def time_agg_small_num_groups(self, method, ndim): ds = getattr(self, f"ds{ndim}d") - getattr(ds.resample(time="3M"), method)() + getattr(ds.resample(time="3M"), method)().compute() @parameterized(["method", "ndim"], [("sum", "mean"), (1, 2)]) def time_agg_large_num_groups(self, method, ndim): ds = getattr(self, f"ds{ndim}d") - getattr(ds.resample(time="48H"), method)() + getattr(ds.resample(time="48H"), method)().compute() class ResampleDask(Resample): From e7eb7df9e73576b0e6414a9bff3d42e5c912b00a Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 27 Mar 2023 21:20:08 -0600 Subject: [PATCH 2/2] [skip-ci] Update asv_bench/benchmarks/groupby.py --- asv_bench/benchmarks/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 2da0f60b4ae..8cd23f3947c 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -71,7 +71,7 @@ def setup(self, *args, **kwargs): super().setup(**kwargs) self.ds1d = self.ds1d.to_dataframe() - self.ds1d_mean = self.ds1d.groupby("b").mean().compute() + self.ds1d_mean = self.ds1d.groupby("b").mean() def time_binary_op_2d(self): raise NotImplementedError