From 6e0797483f3b908f1a62646505ef9539e5652488 Mon Sep 17 00:00:00 2001 From: Colin Ho Date: Fri, 6 Sep 2024 16:16:14 -0700 Subject: [PATCH] improve agg docs --- daft/dataframe/dataframe.py | 4 + .../user_guide/daft_in_depth/aggregations.rst | 87 +++++++++++++++---- 2 files changed, 73 insertions(+), 18 deletions(-) diff --git a/daft/dataframe/dataframe.py b/daft/dataframe/dataframe.py index 418b03bd97..57fb4dcbaa 100644 --- a/daft/dataframe/dataframe.py +++ b/daft/dataframe/dataframe.py @@ -2179,6 +2179,8 @@ def agg(self, *to_agg: Union[Expression, Iterable[Expression]]) -> "DataFrame": """Perform aggregations on this DataFrame. Allows for mixed aggregations for multiple columns Will return a single row that aggregated the entire DataFrame. + For a full list of aggregation expressions, see :ref:`Aggregation Expressions <api-aggregation-expression>` + Example: >>> import daft >>> from daft import col @@ -2834,6 +2836,8 @@ def agg_concat(self, *cols: ColumnInputType) -> "DataFrame": def agg(self, *to_agg: Union[Expression, Iterable[Expression]]) -> "DataFrame": """Perform aggregations on this GroupedDataFrame. Allows for mixed aggregations. + For a full list of aggregation expressions, see :ref:`Aggregation Expressions <api-aggregation-expression>` + Example: >>> import daft >>> from daft import col diff --git a/docs/source/user_guide/daft_in_depth/aggregations.rst b/docs/source/user_guide/daft_in_depth/aggregations.rst index e7944755de..3238fbb958 100644 --- a/docs/source/user_guide/daft_in_depth/aggregations.rst +++ b/docs/source/user_guide/daft_in_depth/aggregations.rst @@ -3,8 +3,6 @@ Aggregations and Grouping Some operations such as the sum or the average of a column are called **aggregations**. Aggregations are operations that reduce the number of rows in a column. -For a full list of available aggregations, see: :ref:`df-aggregations`. - Global Aggregations ------------------- @@ -23,13 +21,41 @@ An aggregation can be applied on an entire DataFrame, for example to get the mea .. 
code:: none - +-----------+ - | score | - | Float64 | - +===========+ - | 25 | - +-----------+ - (Showing first 1 rows) + ╭─────────╮ + │ score │ + │ --- │ + │ Float64 │ + ╞═════════╡ + │ 25 │ + ╰─────────╯ + + (Showing first 1 of 1 rows) + +For a full list of available DataFrame aggregations, see: :ref:`df-aggregations`. + +Aggregations can also be mixed and matched across columns, via the `agg` method: + +.. code:: python + + df.agg( + df["score"].mean().alias("mean_score"), + df["score"].max().alias("max_score"), + df["class"].count().alias("class_count"), + ).show() + +.. code:: none + + ╭────────────┬───────────┬─────────────╮ + │ mean_score ┆ max_score ┆ class_count │ + │ --- ┆ --- ┆ --- │ + │ Float64 ┆ Float64 ┆ UInt64 │ + ╞════════════╪═══════════╪═════════════╡ + │ 25 ┆ 40 ┆ 4 │ + ╰────────────┴───────────┴─────────────╯ + + (Showing first 1 of 1 rows) + +For a full list of available aggregation expressions, see: :ref:`Aggregation Expressions <api-aggregation-expression>` Grouped Aggregations -------------------- @@ -44,12 +70,37 @@ Let's run the mean of column "score" again, but this time grouped by "class": .. code:: none - +---------+-----------+ - | class | score | - | Utf8 | Float64 | - +=========+===========+ - | b | 35 | - +---------+-----------+ - | a | 15 | - +---------+-----------+ - (Showing first 2 rows) + ╭───────┬─────────╮ + │ class ┆ score │ + │ --- ┆ --- │ + │ Utf8 ┆ Float64 │ + ╞═══════╪═════════╡ + │ a ┆ 15 │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ + │ b ┆ 35 │ + ╰───────┴─────────╯ + + (Showing first 2 of 2 rows) + +To run multiple aggregations on a Grouped DataFrame, you can use the `agg` method: + +.. code:: python + + df.groupby("class").agg( + df["score"].mean().alias("mean_score"), + df["score"].max().alias("max_score"), + ).show() + +.. 
code:: none + + ╭───────┬────────────┬───────────╮ + │ class ┆ mean_score ┆ max_score │ + │ --- ┆ --- ┆ --- │ + │ Utf8 ┆ Float64 ┆ Float64 │ + ╞═══════╪════════════╪═══════════╡ + │ a ┆ 15 ┆ 20 │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤ + │ b ┆ 35 ┆ 40 │ + ╰───────┴────────────┴───────────╯ + + (Showing first 2 of 2 rows)