From 6caa80405001a4ae3d77feb004205e20e75c4082 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 15 Jun 2023 10:42:33 -0400 Subject: [PATCH] Minor: Add more doc strings to WindowExpr (#6663) * Minor: Add more doc strings to WindowExpr * More doc improvements * add order by --- .../src/window/partition_evaluator.rs | 67 ++++++++++++++++--- 1 file changed, 56 insertions(+), 11 deletions(-) diff --git a/datafusion/physical-expr/src/window/partition_evaluator.rs b/datafusion/physical-expr/src/window/partition_evaluator.rs index 553f631790b0..0dfad0e80f05 100644 --- a/datafusion/physical-expr/src/window/partition_evaluator.rs +++ b/datafusion/physical-expr/src/window/partition_evaluator.rs @@ -85,17 +85,19 @@ use std::ops::Range; /// previous batch. The previous row number is saved and restored as /// the state. /// -/// [`BuiltInWindowFunctionExpr`]: crate::window::BuiltInWindowFunctionExpr -/// [`BuiltInWindowFunctionExpr::create_evaluator`]: crate::window::BuiltInWindowFunctionExpr::create_evaluator /// When implementing a new `PartitionEvaluator`, /// `uses_window_frame` and `supports_bounded_execution` flags determine which evaluation method will be called /// during runtime. Implement corresponding evaluator according to table below. +/// /// |uses_window_frame|supports_bounded_execution|function_to_implement| /// |---|---|----| /// |false|false|`evaluate_all` (if we were to implement `PERCENT_RANK` it would end up in this quadrant, we cannot produce any result without seeing whole data)| /// |false|true|`evaluate` (optionally can also implement `evaluate_all` for more optimized implementation. However, there will be default implementation that is suboptimal) . If we were to implement `ROW_NUMBER` it will end up in this quadrant. Example `OddRowNumber` showcases this use case| /// |true|false|`evaluate` (I think as long as `uses_window_frame` is `true`. There is no way for `supports_bounded_execution` to be false). I couldn't come up with any example for this quadrant | /// |true|true|`evaluate`. If we were to implement `FIRST_VALUE`, it would end up in this quadrant|. +/// +/// [`BuiltInWindowFunctionExpr`]: crate::window::BuiltInWindowFunctionExpr +/// [`BuiltInWindowFunctionExpr::create_evaluator`]: crate::window::BuiltInWindowFunctionExpr::create_evaluator pub trait PartitionEvaluator: Debug + Send { /// Updates the internal state for window function /// @@ -144,11 +146,41 @@ pub trait PartitionEvaluator: Debug + Send { } } - /// Called for window functions that *do not use* values from the - /// the window frame, such as `ROW_NUMBER`, `RANK`, `DENSE_RANK`, - /// `PERCENT_RANK`, `CUME_DIST`, `LEAD`, `LAG`). It produces result - /// of all rows in a single pass. It expects to receive whole table data - /// as a single batch. + /// Evaluate a window function on an entire input partition. + /// + /// This function is called once per input *partition* for window + /// functions that *do not use* values from the window frame, + /// such as `ROW_NUMBER`, `RANK`, `DENSE_RANK`, `PERCENT_RANK`, + /// `CUME_DIST`, `LEAD`, `LAG`). + /// + /// It produces the result of all rows in a single pass. It + /// expects to receive the entire partition as the `value` and + /// must produce an output column with one output row for every + /// input row. + /// + /// `num_rows` is requied to correctly compute the output in case + /// `values.len() == 0` + /// + /// Using this function is an optimization: certain window + /// functions are not affected by the window frame definition, and + /// thus using `evaluate`, DataFusion can skip the (costly) window + /// frame boundary calculation. + /// + /// For example, the `LAG` built in window function does not use + /// the values of its window frame (it can be computed in one shot + /// on the entire partition with `Self::evaluate_all` regardless of the + /// window defined in the `OVER` clause) + /// + /// ```sql + /// lag(x, 1) OVER (ORDER BY z ROWS BETWEEN 2 PRECEDING AND 3 FOLLOWING) + /// ``` + /// + /// However, `avg()` computes the average in the window and thus + /// does use its window frame + /// + /// ```sql + /// avg(x) OVER (PARTITION BY y ORDER BY z ROWS BETWEEN 2 PRECEDING AND 3 FOLLOWING) + /// ``` fn evaluate_all(&mut self, values: &[ArrayRef], num_rows: usize) -> Result { // When window frame boundaries are not used and evaluator supports bounded execution // We can calculate evaluate result by repeatedly calling `self.evaluate` `num_rows` times @@ -166,9 +198,19 @@ pub trait PartitionEvaluator: Debug + Send { } } - /// Evaluate window function result inside given range. + /// Evaluate window function on a range of rows in an input + /// partition.x /// - /// Only used for stateful evaluation + /// Only used for stateful evaluation. + /// + /// This is the simplest and most general function to implement + /// but also the least performant as it creates output one row at + /// a time. It is typically much faster to implement stateful + /// evaluation using one of the other specialized methods on this + /// trait. + /// + /// Returns a [`ScalarValue`] that is the value of the window + /// function within the rangefor the entire partition fn evaluate( &mut self, _values: &[ArrayRef], @@ -224,9 +266,12 @@ pub trait PartitionEvaluator: Debug + Send { false } - /// Does the window function use the values from its window frame? + /// Does the window function use the values from the window frame, + /// if one is specified? /// - /// If this function returns true, implement [`PartitionEvaluator::evaluate`] + /// If this function returns true, implement [`PartitionEvaluator::evaluate_all`]. + /// + /// See details and examples on [`PartitionEvaluator::evaluate_all`]. fn uses_window_frame(&self) -> bool { false }