docs: document SessionConfig #8771

Merged · 3 commits · Jan 11, 2024
30 changes: 29 additions & 1 deletion datafusion/common/src/config.rs
@@ -153,6 +153,10 @@ macro_rules! config_namespace {

config_namespace! {
/// Options related to catalog and directory scanning
///
/// See also: [`SessionConfig`]
///
/// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
pub struct CatalogOptions {
/// Whether the default catalog and schema should be created automatically.
pub create_default_catalog_and_schema: bool, default = true
@@ -180,6 +184,10 @@ config_namespace! {

config_namespace! {
/// Options related to SQL parser
///
/// See also: [`SessionConfig`]
///
/// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
pub struct SqlParserOptions {
/// When set to true, SQL parser will parse float as decimal type
pub parse_float_as_decimal: bool, default = false
@@ -196,6 +204,10 @@ config_namespace! {

config_namespace! {
/// Options related to query execution
///
/// See also: [`SessionConfig`]
///
/// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
pub struct ExecutionOptions {
/// Default batch size while creating new batches, it's especially useful for
/// buffer-in-memory batches since creating tiny batches would result in too much
@@ -283,6 +295,10 @@ config_namespace! {

config_namespace! {
/// Options related to parquet files
///
/// See also: [`SessionConfig`]
///
/// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
pub struct ParquetOptions {
/// If true, reads the Parquet data page level metadata (the
/// Page Index), if present, to reduce the I/O and number of
@@ -306,7 +322,7 @@ config_namespace! {
pub metadata_size_hint: Option<usize>, default = None

/// If true, filter expressions are be applied during the parquet decoding operation to
/// reduce the number of rows decoded
/// reduce the number of rows decoded. This optimization is sometimes called "late materialization".
Member Author:
Adding this because it was the option I was seeking out with the keyword late materialization.
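For readers who land here via that keyword, here is a minimal sketch of turning the option on from user code. It assumes the `datafusion` crate's prelude, a Tokio runtime, and a hypothetical Parquet file path.

```rust
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> datafusion::error::Result<()> {
    // Enable "late materialization": filters are evaluated while decoding Parquet pages.
    let config = SessionConfig::new()
        .set_bool("datafusion.execution.parquet.pushdown_filters", true);
    let ctx = SessionContext::new_with_config(config);

    // Hypothetical file path; any Parquet source is handled the same way.
    ctx.register_parquet("t", "data.parquet", ParquetReadOptions::default())
        .await?;
    ctx.sql("SELECT a FROM t WHERE b > 5").await?.show().await?;
    Ok(())
}
```

With the option enabled, predicates that can be pushed into the scan are evaluated during decoding, so non-matching rows are never materialized.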

pub pushdown_filters: bool, default = false

/// If true, filter expressions evaluated during the parquet decoding operation
@@ -416,6 +432,10 @@ config_namespace! {

config_namespace! {
/// Options related to aggregate execution
///
/// See also: [`SessionConfig`]
///
/// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
pub struct AggregateOptions {
/// Specifies the threshold for using `ScalarValue`s to update
/// accumulators during high-cardinality aggregations for each input batch.
@@ -433,6 +453,10 @@ config_namespace! {

config_namespace! {
/// Options related to query optimization
///
/// See also: [`SessionConfig`]
///
/// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
pub struct OptimizerOptions {
/// When set to true, the optimizer will push a limit operation into
/// grouped aggregations which have no aggregate expressions, as a soft limit,
@@ -541,6 +565,10 @@ config_namespace! {

config_namespace! {
/// Options controlling explain output
///
/// See also: [`SessionConfig`]
///
/// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
pub struct ExplainOptions {
/// When set to true, the explain statement will only print logical plans
pub logical_plan_only: bool, default = false
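Each `config_namespace!` block above generates both the struct fields and the corresponding dotted string keys. A small sketch of setting one of them through `ConfigOptions` directly (assuming only `datafusion_common` on the path):

```rust
use datafusion_common::config::ConfigOptions;

fn main() -> datafusion_common::Result<()> {
    let mut options = ConfigOptions::new();

    // `ExplainOptions` lives under the `datafusion.explain` namespace,
    // so its fields are addressable as dotted string keys.
    options.set("datafusion.explain.logical_plan_only", "true")?;
    assert!(options.explain.logical_plan_only);
    Ok(())
}
```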
103 changes: 92 additions & 11 deletions datafusion/execution/src/config.rs
@@ -24,7 +24,69 @@ use std::{

use datafusion_common::{config::ConfigOptions, Result, ScalarValue};

/// Configuration options for Execution context
/// Configuration options for [`SessionContext`].
///
/// Can be passed to [`SessionContext::new_with_config`] to customize the configuration of DataFusion.
///
/// Options can be set using namespaced keys with `.` as the separator, where the
/// namespace determines which configuration struct the value is routed to. All
/// built-in options are under the `datafusion` namespace.
///
/// For example, the key `datafusion.execution.batch_size` will set [ExecutionOptions::batch_size][datafusion_common::config::ExecutionOptions::batch_size],
/// because [ConfigOptions::execution] is [ExecutionOptions][datafusion_common::config::ExecutionOptions]. Similarly, the key
/// `datafusion.execution.parquet.pushdown_filters` will set [ParquetOptions::pushdown_filters][datafusion_common::config::ParquetOptions::pushdown_filters],
/// since [ExecutionOptions::parquet][datafusion_common::config::ExecutionOptions::parquet] is [ParquetOptions][datafusion_common::config::ParquetOptions].
///
/// Some options have convenience methods. For example [SessionConfig::with_batch_size] is
/// shorthand for setting `datafusion.execution.batch_size`.
///
/// ```
/// use datafusion_execution::config::SessionConfig;
/// use datafusion_common::ScalarValue;
///
/// let config = SessionConfig::new()
/// .set("datafusion.execution.batch_size", ScalarValue::UInt64(Some(1234)))
/// .set_bool("datafusion.execution.parquet.pushdown_filters", true);
///
/// assert_eq!(config.batch_size(), 1234);
/// assert_eq!(config.options().execution.batch_size, 1234);
/// assert_eq!(config.options().execution.parquet.pushdown_filters, true);
/// ```
///
/// You can also directly mutate the options via [SessionConfig::options_mut].
/// So the following is equivalent to the above:
///
/// ```
/// # use datafusion_execution::config::SessionConfig;
/// # use datafusion_common::ScalarValue;
/// #
/// let mut config = SessionConfig::new();
/// config.options_mut().execution.batch_size = 1234;
/// config.options_mut().execution.parquet.pushdown_filters = true;
/// #
/// # assert_eq!(config.batch_size(), 1234);
/// # assert_eq!(config.options().execution.batch_size, 1234);
/// # assert_eq!(config.options().execution.parquet.pushdown_filters, true);
/// ```
///
/// ## Built-in options
///
/// | Namespace | Config struct |
/// | --------- | ------------- |
/// | `datafusion.catalog` | [CatalogOptions][datafusion_common::config::CatalogOptions] |
Contributor:
Should we also mention AggregateOptions, ParquetOptions and SqlParserOptions?

Member Author:
SqlParserOptions is already there, but added the other two.

/// | `datafusion.execution` | [ExecutionOptions][datafusion_common::config::ExecutionOptions] |
/// | `datafusion.execution.aggregate` | [AggregateOptions][datafusion_common::config::AggregateOptions] |
/// | `datafusion.execution.parquet` | [ParquetOptions][datafusion_common::config::ParquetOptions] |
/// | `datafusion.optimizer` | [OptimizerOptions][datafusion_common::config::OptimizerOptions] |
/// | `datafusion.sql_parser` | [SqlParserOptions][datafusion_common::config::SqlParserOptions] |
/// | `datafusion.explain` | [ExplainOptions][datafusion_common::config::ExplainOptions] |
///
/// ## Custom configuration
///
/// Configuration options can be extended. See [SessionConfig::with_extension] for details.
///
/// [`SessionContext`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html
/// [`SessionContext::new_with_config`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html#method.new_with_config
#[derive(Clone, Debug)]
pub struct SessionConfig {
/// Configuration options
@@ -62,6 +124,35 @@ impl SessionConfig {
Ok(ConfigOptions::from_string_hash_map(settings)?.into())
}

/// Return a handle to the configuration options.
///
/// Can be used to read the current configuration.
///
/// ```
/// use datafusion_execution::config::SessionConfig;
///
/// let config = SessionConfig::new();
/// assert!(config.options().execution.batch_size > 0);
/// ```
pub fn options(&self) -> &ConfigOptions {
&self.options
}

/// Return a mutable handle to the configuration options.
///
/// Can be used to set configuration options.
///
/// ```
/// use datafusion_execution::config::SessionConfig;
///
/// let mut config = SessionConfig::new();
/// config.options_mut().execution.batch_size = 1024;
/// assert_eq!(config.options().execution.batch_size, 1024);
/// ```
pub fn options_mut(&mut self) -> &mut ConfigOptions {
&mut self.options
}

/// Set a configuration option
pub fn set(mut self, key: &str, value: ScalarValue) -> Self {
self.options.set(key, &value.to_string()).unwrap();
@@ -346,16 +437,6 @@ impl SessionConfig {
&mut self.options
}

/// Return a handle to the configuration options.
pub fn options(&self) -> &ConfigOptions {
&self.options
}

/// Return a mutable handle to the configuration options.
pub fn options_mut(&mut self) -> &mut ConfigOptions {
&mut self.options
}

/// Add extensions.
///
/// Extensions can be used to attach extra data to the session config -- e.g. tracing information or caches.
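To illustrate the extension mechanism referenced just above, here is a rough sketch; `AppState` is a made-up type, and the snippet assumes `with_extension` / `get_extension` take and return `Arc`s keyed by type:

```rust
use std::sync::Arc;
use datafusion_execution::config::SessionConfig;

// Made-up application state attached to the session config.
struct AppState {
    tenant_id: String,
}

fn main() {
    let config = SessionConfig::new()
        .with_extension(Arc::new(AppState { tenant_id: "acme".to_string() }));

    // Extensions are retrieved by type; a type that was never attached yields None.
    let state = config.get_extension::<AppState>().expect("extension was attached");
    assert_eq!(state.tenant_id, "acme");
}
```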
2 changes: 1 addition & 1 deletion datafusion/sqllogictest/test_files/information_schema.slt
@@ -248,7 +248,7 @@ datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 By def
datafusion.execution.parquet.maximum_parallel_row_group_writers 1 By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame.
datafusion.execution.parquet.metadata_size_hint NULL If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer
datafusion.execution.parquet.pruning true If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file
datafusion.execution.parquet.pushdown_filters false If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded
datafusion.execution.parquet.pushdown_filters false If true, filter expressions are applied during the parquet decoding operation to reduce the number of rows decoded. This optimization is sometimes called "late materialization".
datafusion.execution.parquet.reorder_filters false If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query
datafusion.execution.parquet.skip_metadata true If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata
datafusion.execution.parquet.statistics_enabled NULL Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting
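The descriptions asserted in this test are also queryable at runtime through the `information_schema.df_settings` view. A sketch, assuming the `datafusion` prelude and a Tokio runtime:

```rust
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> datafusion::error::Result<()> {
    // df_settings is only visible when the information_schema is enabled.
    let config = SessionConfig::new().with_information_schema(true);
    let ctx = SessionContext::new_with_config(config);

    ctx.sql(
        "SELECT name, value, description \
         FROM information_schema.df_settings \
         WHERE name LIKE 'datafusion.execution.parquet.%'",
    )
    .await?
    .show()
    .await?;
    Ok(())
}
```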
2 changes: 1 addition & 1 deletion docs/source/user-guide/configs.md
@@ -53,7 +53,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus
| datafusion.execution.parquet.pruning | true | If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file |
| datafusion.execution.parquet.skip_metadata | true | If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata |
| datafusion.execution.parquet.metadata_size_hint | NULL | If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer |
| datafusion.execution.parquet.pushdown_filters | false | If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded |
| datafusion.execution.parquet.pushdown_filters | false | If true, filter expressions are applied during the parquet decoding operation to reduce the number of rows decoded. This optimization is sometimes called "late materialization". |
| datafusion.execution.parquet.reorder_filters | false | If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query |
| datafusion.execution.parquet.data_pagesize_limit | 1048576 | Sets best effort maximum size of data page in bytes |
| datafusion.execution.parquet.write_batch_size | 1024 | Sets write_batch_size in bytes |
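The note above about environment variables can also be exercised programmatically; a sketch, assuming `SessionConfig::from_env` and the `DATAFUSION_*` key mapping:

```rust
use datafusion_execution::config::SessionConfig;

fn main() -> datafusion_common::Result<()> {
    // Environment variables are read when the config is built, so set them first.
    std::env::set_var("DATAFUSION_EXECUTION_BATCH_SIZE", "4096");

    let config = SessionConfig::from_env()?;
    assert_eq!(config.options().execution.batch_size, 4096);
    Ok(())
}
```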