From c052d6abc0dec8875c7875dee87c573edad8dea3 Mon Sep 17 00:00:00 2001 From: Bikramjeet Vig Date: Mon, 9 Sep 2024 14:24:29 -0700 Subject: [PATCH] Add query configs to turn off expression evaluation optimizations (#10902) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/10902 This change adds query configs to individually turn off expression evaluation optimizations like dictionary peeling, dictionary memoization, reusing shared subexpression results and deferring lazy vector loading. The goal is to streamline debugging in production and enable prompt mitigation of bugs or regressions caused by the optimization or surfaced due to it. Note: When peeling is turned off, we still ensure that single arg functions recieve a flat input Reviewed By: mbasmanova Differential Revision: D61943875 --- velox/core/QueryConfig.h | 37 ++++++ velox/core/QueryCtx.h | 75 +++++++++--- velox/core/tests/QueryConfigTest.cpp | 62 +++++++++- velox/docs/configs.rst | 16 +++ velox/expression/EvalCtx.cpp | 22 +--- velox/expression/EvalCtx.h | 29 +++-- velox/expression/Expr.cpp | 28 +++-- velox/expression/Expr.h | 5 +- velox/expression/tests/ExprTest.cpp | 176 ++++++++++++++++++++++++++- 9 files changed, 392 insertions(+), 58 deletions(-) diff --git a/velox/core/QueryConfig.h b/velox/core/QueryConfig.h index 814bdef03a2..def08a9de31 100644 --- a/velox/core/QueryConfig.h +++ b/velox/core/QueryConfig.h @@ -357,6 +357,43 @@ class QueryConfig { /// Empty string if only want to trace the query metadata. static constexpr const char* kQueryTraceNodeIds = "query_trace_node_ids"; + /// Disable optimization in expression evaluation to peel common dictionary + /// layer from inputs. + static constexpr const char* kDebugDisableExpressionWithPeeling = + "debug_disable_expression_with_peeling"; + + /// Disable optimization in expression evaluation to re-use cached results for + /// common sub-expressions. 
+ static constexpr const char* kDebugDisableCommonSubExpressions = + "debug_disable_common_sub_expressions"; + + /// Disable optimization in expression evaluation to re-use cached results + /// between subsequent input batches that are dictionary encoded and have the + /// same alphabet(underlying flat vector). + static constexpr const char* kDebugDisableExpressionWithMemoization = + "debug_disable_expression_with_memoization"; + + /// Disable optimization in expression evaluation to delay loading of lazy + /// inputs unless required. + static constexpr const char* kDebugDisableExpressionWithLazyInputs = + "debug_disable_expression_with_lazy_inputs"; + + bool debugDisableExpressionsWithPeeling() const { + return get(kDebugDisableExpressionWithPeeling, false); + } + + bool debugDisableCommonSubExpressions() const { + return get(kDebugDisableCommonSubExpressions, false); + } + + bool debugDisableExpressionsWithMemoization() const { + return get(kDebugDisableExpressionWithMemoization, false); + } + + bool debugDisableExpressionsWithLazyInputs() const { + return get(kDebugDisableExpressionWithLazyInputs, false); + } + uint64_t queryMaxMemoryPerNode() const { return config::toCapacity( get(kQueryMaxMemoryPerNode, "0B"), diff --git a/velox/core/QueryCtx.h b/velox/core/QueryCtx.h index c2274cf4af4..89ff229e3a4 100644 --- a/velox/core/QueryCtx.h +++ b/velox/core/QueryCtx.h @@ -229,12 +229,53 @@ class ExecCtx { ExecCtx(memory::MemoryPool* pool, QueryCtx* queryCtx) : pool_(pool), queryCtx_(queryCtx), - exprEvalCacheEnabled_( - !queryCtx || - queryCtx->queryConfig().isExpressionEvaluationCacheEnabled()), + optimizationParams_(queryCtx), vectorPool_( - exprEvalCacheEnabled_ ? std::make_unique(pool) - : nullptr) {} + optimizationParams_.exprEvalCacheEnabled + ? 
std::make_unique<VectorPool>(pool)
+              : nullptr) {}
+
+  struct OptimizationParams {
+    explicit OptimizationParams(QueryCtx* queryCtx) {
+      const core::QueryConfig defaultQueryConfig = core::QueryConfig({});
+
+      const core::QueryConfig& queryConfig =
+          queryCtx ? queryCtx->queryConfig() : defaultQueryConfig;
+
+      exprEvalCacheEnabled = queryConfig.isExpressionEvaluationCacheEnabled();
+      dictionaryMemoizationEnabled =
+          !queryConfig.debugDisableExpressionsWithMemoization() &&
+          exprEvalCacheEnabled;
+      peelingEnabled = !queryConfig.debugDisableExpressionsWithPeeling();
+      sharedSubExpressionReuseEnabled =
+          !queryConfig.debugDisableCommonSubExpressions();
+      deferredLazyLoadingEnabled =
+          !queryConfig.debugDisableExpressionsWithLazyInputs();
+      maxSharedSubexprResultsCached =
+          queryConfig.maxSharedSubexprResultsCached();
+    }
+
+    /// True if caches in expression evaluation used for performance are
+    /// enabled, including VectorPool, DecodedVectorPool, SelectivityVectorPool
+    /// and dictionary memoization.
+    bool exprEvalCacheEnabled;
+    /// True if dictionary memoization optimization is enabled during
+    /// expression evaluation, which allows the reuse of results between
+    /// consecutive input batches if they are dictionary encoded and have the
+    /// same alphabet (underlying flat vector).
+    bool dictionaryMemoizationEnabled;
+    /// True if peeling is enabled during expression evaluation.
+    bool peelingEnabled;
+    /// True if shared subexpression reuse is enabled during expression
+    /// evaluation.
+    bool sharedSubExpressionReuseEnabled;
+    /// True if loading lazy inputs is deferred till they need to be
+    /// accessed during expression evaluation.
+    bool deferredLazyLoadingEnabled;
+    /// The maximum number of distinct inputs to cache results in a
+    /// given shared subexpression during expression evaluation. 
+ uint32_t maxSharedSubexprResultsCached; + }; velox::memory::MemoryPool* pool() const { return pool_; @@ -251,7 +292,9 @@ class ExecCtx { /// Prefer using LocalSelectivityVector which takes care of returning the /// vector to the pool on destruction. std::unique_ptr getSelectivityVector(int32_t size) { - VELOX_CHECK(exprEvalCacheEnabled_ || selectivityVectorPool_.empty()); + VELOX_CHECK( + optimizationParams_.exprEvalCacheEnabled || + selectivityVectorPool_.empty()); if (selectivityVectorPool_.empty()) { return std::make_unique(size); } @@ -265,7 +308,9 @@ class ExecCtx { // content. The caller is responsible for setting the size and // assigning the contents. std::unique_ptr getSelectivityVector() { - VELOX_CHECK(exprEvalCacheEnabled_ || selectivityVectorPool_.empty()); + VELOX_CHECK( + optimizationParams_.exprEvalCacheEnabled || + selectivityVectorPool_.empty()); if (selectivityVectorPool_.empty()) { return std::make_unique(); } @@ -276,7 +321,7 @@ class ExecCtx { // Returns true if the vector was moved into the pool. bool releaseSelectivityVector(std::unique_ptr&& vector) { - if (exprEvalCacheEnabled_) { + if (optimizationParams_.exprEvalCacheEnabled) { selectivityVectorPool_.push_back(std::move(vector)); return true; } @@ -284,7 +329,8 @@ class ExecCtx { } std::unique_ptr getDecodedVector() { - VELOX_CHECK(exprEvalCacheEnabled_ || decodedVectorPool_.empty()); + VELOX_CHECK( + optimizationParams_.exprEvalCacheEnabled || decodedVectorPool_.empty()); if (decodedVectorPool_.empty()) { return std::make_unique(); } @@ -295,7 +341,7 @@ class ExecCtx { // Returns true if the vector was moved into the pool. 
bool releaseDecodedVector(std::unique_ptr&& vector) { - if (exprEvalCacheEnabled_) { + if (optimizationParams_.exprEvalCacheEnabled) { decodedVectorPool_.push_back(std::move(vector)); return true; } @@ -334,8 +380,8 @@ class ExecCtx { return 0; } - bool exprEvalCacheEnabled() const { - return exprEvalCacheEnabled_; + const OptimizationParams& optimizationParams() const { + return optimizationParams_; } private: @@ -343,8 +389,9 @@ class ExecCtx { memory::MemoryPool* const pool_; QueryCtx* const queryCtx_; - const bool exprEvalCacheEnabled_; - // A pool of preallocated DecodedVectors for use by expressions and operators. + const OptimizationParams optimizationParams_; + // A pool of preallocated DecodedVectors for use by expressions and + // operators. std::vector> decodedVectorPool_; // A pool of preallocated SelectivityVectors for use by expressions // and operators. diff --git a/velox/core/tests/QueryConfigTest.cpp b/velox/core/tests/QueryConfigTest.cpp index 81a199685bc..c89d44d2fdf 100644 --- a/velox/core/tests/QueryConfigTest.cpp +++ b/velox/core/tests/QueryConfigTest.cpp @@ -118,12 +118,16 @@ TEST_F(QueryConfigTest, enableExpressionEvaluationCacheConfig) { enableExpressionEvaluationCache); auto execCtx = std::make_shared(pool.get(), queryCtx.get()); - ASSERT_EQ(execCtx->exprEvalCacheEnabled(), enableExpressionEvaluationCache); + ASSERT_EQ( + execCtx->optimizationParams().exprEvalCacheEnabled, + enableExpressionEvaluationCache); ASSERT_EQ( execCtx->vectorPool() != nullptr, enableExpressionEvaluationCache); auto evalCtx = std::make_shared(execCtx.get()); - ASSERT_EQ(evalCtx->cacheEnabled(), enableExpressionEvaluationCache); + ASSERT_EQ( + evalCtx->dictionaryMemoizationEnabled(), + enableExpressionEvaluationCache); // Test ExecCtx::selectivityVectorPool_. 
auto rows = execCtx->getSelectivityVector(100);
@@ -144,4 +148,58 @@
   testConfig(false);
 }
 
+TEST_F(QueryConfigTest, expressionEvaluationRelatedConfigs) {
+  // Verify that the expression evaluation related configs are propagated
+  // correctly to EvalCtx which is used during expression evaluation. Each
+  // config is individually set and verified.
+  std::shared_ptr<memory::MemoryPool> rootPool{
+      memory::memoryManager()->addRootPool()};
+  std::shared_ptr<memory::MemoryPool> pool{rootPool->addLeafChild("leaf")};
+
+  auto testConfig =
+      [&](std::unordered_map<std::string, std::string> configData) {
+        auto queryCtx =
+            core::QueryCtx::create(nullptr, QueryConfig{std::move(configData)});
+        const auto& queryConfig = queryCtx->queryConfig();
+        auto execCtx =
+            std::make_shared<core::ExecCtx>(pool.get(), queryCtx.get());
+        auto evalCtx = std::make_shared<exec::EvalCtx>(execCtx.get());
+
+        ASSERT_EQ(
+            evalCtx->peelingEnabled(),
+            !queryConfig.debugDisableExpressionsWithPeeling());
+        ASSERT_EQ(
+            evalCtx->sharedSubExpressionReuseEnabled(),
+            !queryConfig.debugDisableCommonSubExpressions());
+        ASSERT_EQ(
+            evalCtx->dictionaryMemoizationEnabled(),
+            !queryConfig.debugDisableExpressionsWithMemoization());
+        ASSERT_EQ(
+            evalCtx->deferredLazyLoadingEnabled(),
+            !queryConfig.debugDisableExpressionsWithLazyInputs());
+      };
+
+  auto createConfig = [&](bool debugDisableExpressionsWithPeeling,
+                          bool debugDisableCommonSubExpressions,
+                          bool debugDisableExpressionsWithMemoization,
+                          bool debugDisableExpressionsWithLazyInputs) -> auto {
+    std::unordered_map<std::string, std::string> configData(
+        {{core::QueryConfig::kDebugDisableExpressionWithPeeling,
+          std::to_string(debugDisableExpressionsWithPeeling)},
+         {core::QueryConfig::kDebugDisableCommonSubExpressions,
+          std::to_string(debugDisableCommonSubExpressions)},
+         {core::QueryConfig::kDebugDisableExpressionWithMemoization,
+          std::to_string(debugDisableExpressionsWithMemoization)},
+         {core::QueryConfig::kDebugDisableExpressionWithLazyInputs,
+          
std::to_string(debugDisableExpressionsWithLazyInputs)}}); + return configData; + }; + + testConfig({}); // Verify default config. + testConfig(createConfig(true, false, false, false)); + testConfig(createConfig(false, true, false, false)); + testConfig(createConfig(false, false, true, false)); + testConfig(createConfig(false, false, false, true)); +} + } // namespace facebook::velox::core::test diff --git a/velox/docs/configs.rst b/velox/docs/configs.rst index 746ad0d8cd4..bdcd28d054a 100644 --- a/velox/docs/configs.rst +++ b/velox/docs/configs.rst @@ -168,6 +168,22 @@ Expression Evaluation Configuration - bool - false - This flag makes the Row conversion to by applied in a way that the casting row field are matched by name instead of position. + * - debug_disable_expression_with_peeling + - bool + - false + - Disable optimization in expression evaluation to peel common dictionary layer from inputs. Should only be used for debugging. + * - debug_disable_common_sub_expressions + - bool + - false + - Disable optimization in expression evaluation to re-use cached results for common sub-expressions. Should only be used for debugging. + * - debug_disable_expression_with_memoization + - bool + - false + - Disable optimization in expression evaluation to re-use cached results between subsequent input batches that are dictionary encoded and have the same alphabet(underlying flat vector). Should only be used for debugging. + * - debug_disable_expression_with_lazy_inputs + - bool + - false + - Disable optimization in expression evaluation to delay loading of lazy inputs unless required. Should only be used for debugging. 
Memory Management ----------------- diff --git a/velox/expression/EvalCtx.cpp b/velox/expression/EvalCtx.cpp index e43f0aae343..ae87fb28063 100644 --- a/velox/expression/EvalCtx.cpp +++ b/velox/expression/EvalCtx.cpp @@ -26,16 +26,7 @@ using facebook::velox::common::testutil::TestValue; namespace facebook::velox::exec { EvalCtx::EvalCtx(core::ExecCtx* execCtx, ExprSet* exprSet, const RowVector* row) - : execCtx_(execCtx), - exprSet_(exprSet), - row_(row), - cacheEnabled_(execCtx->exprEvalCacheEnabled()), - maxSharedSubexprResultsCached_( - execCtx->queryCtx() - ? execCtx->queryCtx() - ->queryConfig() - .maxSharedSubexprResultsCached() - : core::QueryConfig({}).maxSharedSubexprResultsCached()) { + : execCtx_(execCtx), exprSet_(exprSet), row_(row) { // TODO Change the API to replace raw pointers with non-const references. // Sanity check inputs to prevent crashes. VELOX_CHECK_NOT_NULL(execCtx); @@ -53,16 +44,7 @@ EvalCtx::EvalCtx(core::ExecCtx* execCtx, ExprSet* exprSet, const RowVector* row) } EvalCtx::EvalCtx(core::ExecCtx* execCtx) - : execCtx_(execCtx), - exprSet_(nullptr), - row_(nullptr), - cacheEnabled_(execCtx->exprEvalCacheEnabled()), - maxSharedSubexprResultsCached_( - execCtx->queryCtx() - ? execCtx->queryCtx() - ->queryConfig() - .maxSharedSubexprResultsCached() - : core::QueryConfig({}).maxSharedSubexprResultsCached()) { + : execCtx_(execCtx), exprSet_(nullptr), row_(nullptr) { VELOX_CHECK_NOT_NULL(execCtx); } diff --git a/velox/expression/EvalCtx.h b/velox/expression/EvalCtx.h index 0d4ba9c655f..22fb13e478b 100644 --- a/velox/expression/EvalCtx.h +++ b/velox/expression/EvalCtx.h @@ -519,16 +519,33 @@ class EvalCtx { return peeledEncoding_.get(); } - /// Returns true if caching in expression evaluation is enabled, such as - /// Expr::evalWithMemo. 
-  bool cacheEnabled() const {
-    return cacheEnabled_;
+  /// Returns true if dictionary memoization optimization is enabled, which
+  /// allows the reuse of results between consecutive input batches if they are
+  /// dictionary encoded and have the same alphabet (underlying flat vector).
+  bool dictionaryMemoizationEnabled() const {
+    return execCtx_->optimizationParams().dictionaryMemoizationEnabled;
   }
 
   /// Returns the maximum number of distinct inputs to cache results for in a
   /// given shared subexpression.
   uint32_t maxSharedSubexprResultsCached() const {
-    return maxSharedSubexprResultsCached_;
+    return execCtx_->optimizationParams().maxSharedSubexprResultsCached;
+  }
+
+  /// Returns true if peeling is enabled.
+  bool peelingEnabled() const {
+    return execCtx_->optimizationParams().peelingEnabled;
+  }
+
+  /// Returns true if shared subexpression reuse is enabled.
+  bool sharedSubExpressionReuseEnabled() const {
+    return execCtx_->optimizationParams().sharedSubExpressionReuseEnabled;
+  }
+
+  /// Returns true if loading lazy inputs is deferred till they need to be
+  /// accessed.
+  bool deferredLazyLoadingEnabled() const {
+    return execCtx_->optimizationParams().deferredLazyLoadingEnabled;
   }
 
  private:
@@ -550,8 +567,6 @@ class EvalCtx {
   core::ExecCtx* const execCtx_;
   ExprSet* const exprSet_;
   const RowVector* row_;
-  const bool cacheEnabled_;
-  const uint32_t maxSharedSubexprResultsCached_;
   bool inputFlatNoNulls_;
   // Corresponds 1:1 to children of 'row_'. 
Set to an inner vector diff --git a/velox/expression/Expr.cpp b/velox/expression/Expr.cpp index 8be2572bd7a..ee06286dfa4 100644 --- a/velox/expression/Expr.cpp +++ b/velox/expression/Expr.cpp @@ -708,7 +708,7 @@ void Expr::evalFlatNoNulls( EvalCtx& context, VectorPtr& result, const ExprSet* parentExprSet) { - if (shouldEvaluateSharedSubexp()) { + if (shouldEvaluateSharedSubexp(context)) { evaluateSharedSubexpr( rows, context, @@ -819,7 +819,8 @@ void Expr::eval( // // TODO: Re-work the logic of deciding when to load which field. if (!hasConditionals_ || distinctFields_.size() == 1 || - shouldEvaluateSharedSubexp()) { + shouldEvaluateSharedSubexp(context) || + !context.deferredLazyLoadingEnabled()) { // Load lazy vectors if any. for (auto* field : distinctFields_) { context.ensureFieldLoaded(field->index(context), rows); @@ -874,10 +875,8 @@ void Expr::evaluateSharedSubexpr( } if (sharedSubexprResultsIter == sharedSubexprResults_.end()) { - auto maxSharedSubexprResultsCached = context.execCtx() - ->queryCtx() - ->queryConfig() - .maxSharedSubexprResultsCached(); + auto maxSharedSubexprResultsCached = + context.maxSharedSubexprResultsCached(); if (sharedSubexprResults_.size() < maxSharedSubexprResultsCached) { // If we have room left in the cache, add it. sharedSubexprResultsIter = @@ -1039,7 +1038,7 @@ Expr::PeelEncodingsResult Expr::peelEncodings( // If the expression depends on one dictionary, results are cacheable. 
bool mayCache = false; - if (context.cacheEnabled()) { + if (context.dictionaryMemoizationEnabled()) { mayCache = distinctFields_.size() == 1 && VectorEncoding::isDictionary(context.wrapEncoding()) && !peeledVectors[0]->memoDisabled(); @@ -1054,7 +1053,8 @@ void Expr::evalEncodings( const SelectivityVector& rows, EvalCtx& context, VectorPtr& result) { - if (deterministic_ && !skipFieldDependentOptimizations()) { + if (deterministic_ && !skipFieldDependentOptimizations() && + context.peelingEnabled()) { bool hasFlat = false; for (auto* field : distinctFields_) { if (isFlat(*context.getField(field->index(context)))) { @@ -1381,7 +1381,7 @@ void Expr::evalAll( return; } - if (shouldEvaluateSharedSubexp()) { + if (shouldEvaluateSharedSubexp(context)) { evaluateSharedSubexpr( rows, context, @@ -1462,6 +1462,16 @@ bool Expr::applyFunctionWithPeeling( VectorPtr& result) { LocalDecodedVector localDecoded(context); LocalSelectivityVector newRowsHolder(context); + if (!context.peelingEnabled()) { + if (inputValues_.size() == 1) { + // If we have a single input, velox needs to ensure that the + // vectorFunction would receive a flat input. + BaseVector::flattenVector(inputValues_[0]); + applyFunction(applyRows, context, result); + return true; + } + return false; + } // Attempt peeling. std::vector peeledVectors; auto peeledEncoding = PeeledEncoding::peel( diff --git a/velox/expression/Expr.h b/velox/expression/Expr.h index b425fd392bc..ade47d61f8b 100644 --- a/velox/expression/Expr.h +++ b/velox/expression/Expr.h @@ -486,8 +486,9 @@ class Expr { /// Evaluation of such expression is optimized by memoizing and reusing /// the results of prior evaluations. That logic is implemented in /// 'evaluateSharedSubexpr'. 
- bool shouldEvaluateSharedSubexp() const { - return deterministic_ && isMultiplyReferenced_ && !inputs_.empty(); + bool shouldEvaluateSharedSubexp(EvalCtx& context) const { + return deterministic_ && isMultiplyReferenced_ && !inputs_.empty() && + context.sharedSubExpressionReuseEnabled(); } /// Evaluate common sub-expression. Check if sharedSubexprValues_ already has diff --git a/velox/expression/tests/ExprTest.cpp b/velox/expression/tests/ExprTest.cpp index d3af7b405d0..100d0acb71b 100644 --- a/velox/expression/tests/ExprTest.cpp +++ b/velox/expression/tests/ExprTest.cpp @@ -166,10 +166,12 @@ class ExprTest : public testing::Test, public VectorTestBase { evaluateMultipleWithStats( const std::vector& texts, const RowVectorPtr& input, - std::vector resultToReuse = {}) { + std::vector resultToReuse = {}, + core::ExecCtx* execCtx = nullptr) { auto exprSet = compileMultiple(texts, asRowType(input->type())); - exec::EvalCtx context(execCtx_.get(), exprSet.get(), input.get()); + exec::EvalCtx context( + execCtx ? execCtx : execCtx_.get(), exprSet.get(), input.get()); SelectivityVector rows(input->size()); if (resultToReuse.empty()) { @@ -190,11 +192,15 @@ class ExprTest : public testing::Test, public VectorTestBase { } std::pair> - evaluateWithStats(exec::ExprSet* exprSetPtr, const RowVectorPtr& input) { + evaluateWithStats( + exec::ExprSet* exprSetPtr, + const RowVectorPtr& input, + core::ExecCtx* execCtx = nullptr) { SelectivityVector rows(input->size()); std::vector results(1); - exec::EvalCtx context(execCtx_.get(), exprSetPtr, input.get()); + exec::EvalCtx context( + execCtx ? 
execCtx : execCtx_.get(), exprSetPtr, input.get()); exprSetPtr->eval(rows, context, results); return {results[0], exprSetPtr->stats()}; @@ -4795,5 +4801,167 @@ VELOX_INSTANTIATE_TEST_SUITE_P( ParameterizedExprTest, testing::ValuesIn({false, true})); +TEST_F(ExprTest, disablePeeling) { + // Verify that peeling is disabled when the config is set by checking whether + // the number of rows processed is equal to the alphabet size (when enabled) + // or the dictionary size (when disabled). + // Also, ensure that single arg function recieves a flat vector even when + // peeling is disabled. + + // This throws if input is not flat or constant. + VELOX_REGISTER_VECTOR_FUNCTION( + udf_testing_single_arg_deterministic, "testing_single_arg_deterministic"); + // This wraps the input in a dictionary. + exec::registerVectorFunction( + "dict_wrap", + WrapInDictionaryFunc::signatures(), + std::make_unique(), + exec::VectorFunctionMetadataBuilder().defaultNullBehavior(false).build()); + const std::vector expressions = {"c0 + 1"}; + + auto flatInput = makeFlatVector({1, 2, 3}); + auto flatSize = flatInput->size(); + auto dictInput = wrapInDictionary( + makeIndices(2 * flatSize, [&](auto row) { return row % flatSize; }), + 2 * flatSize, + flatInput); + auto dictSize = dictInput->size(); + + // Peeling Enabled (by default) + auto [result, stats] = evaluateMultipleWithStats( + expressions, makeRowVector({dictInput}), {}, execCtx_.get()); + + ASSERT_TRUE(stats.find("plus") != stats.end()); + ASSERT_EQ(stats["plus"].numProcessedRows, flatSize); + + // Peeling Disabled + std::unordered_map configData( + {{core::QueryConfig::kDebugDisableExpressionWithPeeling, "true"}}); + auto queryCtx = velox::core::QueryCtx::create( + nullptr, core::QueryConfig(std::move(configData))); + auto execCtx = std::make_unique(pool_.get(), queryCtx.get()); + + std::tie(result, stats) = evaluateMultipleWithStats( + expressions, makeRowVector({dictInput}), {}, execCtx.get()); + + ASSERT_TRUE(stats.find("plus") != 
stats.end());
+  ASSERT_EQ(stats["plus"].numProcessedRows, dictSize);
+
+  // Ensure single arg function receives a flat vector.
+  // When top level column is dictionary wrapped.
+  ASSERT_NO_THROW(evaluateMultiple(
+      {"testing_single_arg_deterministic((c0))"},
+      makeRowVector({dictInput}),
+      {},
+      execCtx.get()));
+  // When intermediate column is dictionary wrapped.
+  // dict_wrap helps generate an intermediate dictionary vector.
+  ASSERT_NO_THROW(evaluateMultiple(
+      {"testing_single_arg_deterministic(dict_wrap(c0))"},
+      makeRowVector({flatInput}),
+      {},
+      execCtx.get()));
+}
+
+TEST_F(ExprTest, disableSharedSubExpressionReuse) {
+  // Verify that shared subexpression reuse is disabled when the config is set
+  // by confirming that the same rows are processed twice by the shared
+  // expression when it is disabled.
+  const std::vector<std::string> expressions = {"c0 + 1", "(c0 + 1) = 0"};
+
+  auto flatInput = makeFlatVector<int64_t>({1, 2, 3});
+  auto flatSize = flatInput->size();
+
+  // SharedSubExpressionReuse Enabled (by default)
+  auto [result, stats] = evaluateMultipleWithStats(
+      expressions, makeRowVector({flatInput}), {}, execCtx_.get());
+
+  ASSERT_TRUE(stats.find("plus") != stats.end());
+  ASSERT_EQ(stats["plus"].numProcessedRows, flatSize);
+
+  // SharedSubExpressionReuse Disabled
+  std::unordered_map<std::string, std::string> configData(
+      {{core::QueryConfig::kDebugDisableCommonSubExpressions, "true"}});
+  auto queryCtx = velox::core::QueryCtx::create(
+      nullptr, core::QueryConfig(std::move(configData)));
+  auto execCtx = std::make_unique<core::ExecCtx>(pool_.get(), queryCtx.get());
+
+  std::tie(result, stats) = evaluateMultipleWithStats(
+      expressions, makeRowVector({flatInput}), {}, execCtx.get());
+
+  ASSERT_TRUE(stats.find("plus") != stats.end());
+  ASSERT_EQ(stats["plus"].numProcessedRows, 2 * flatSize);
+}
+
+TEST_F(ExprTest, disableMemoization) {
+  // Verify that memoization is disabled when the config is set by confirming
+  // that the third invocation does not reuse the results. 
+  auto flatInput = makeFlatVector<int64_t>({1, 2, 3});
+  auto flatSize = flatInput->size();
+  auto dictInput = wrapInDictionary(
+      makeIndices(2 * flatSize, [&](auto row) { return row % flatSize; }),
+      2 * flatSize,
+      flatInput);
+  auto dictSize = dictInput->size();
+  auto inputRow = makeRowVector({dictInput});
+
+  auto exprSet = compileExpression("c0 + 1", asRowType(inputRow->type()));
+  // Memoization Enabled (by default). We need to evaluate the expression
+  // at least twice to enable memoization. The third invocation will use the
+  // memoized result.
+  evaluateWithStats(exprSet.get(), inputRow, execCtx_.get());
+  evaluateWithStats(exprSet.get(), inputRow, execCtx_.get());
+  auto [result, stats] =
+      evaluateWithStats(exprSet.get(), inputRow, execCtx_.get());
+
+  ASSERT_TRUE(stats.find("plus") != stats.end());
+  ASSERT_EQ(stats["plus"].numProcessedRows, 2 * flatSize);
+
+  // Memoization Disabled
+  std::unordered_map<std::string, std::string> configData(
+      {{core::QueryConfig::kDebugDisableExpressionWithMemoization, "true"}});
+  auto queryCtx = velox::core::QueryCtx::create(
+      nullptr, core::QueryConfig(std::move(configData)));
+  auto execCtx = std::make_unique<core::ExecCtx>(pool_.get(), queryCtx.get());
+
+  exprSet = compileExpression("c0 + 1", asRowType(inputRow->type()));
+  evaluateWithStats(exprSet.get(), inputRow, execCtx.get());
+  evaluateWithStats(exprSet.get(), inputRow, execCtx.get());
+  std::tie(result, stats) =
+      evaluateWithStats(exprSet.get(), inputRow, execCtx.get());
+
+  ASSERT_TRUE(stats.find("plus") != stats.end());
+  ASSERT_EQ(stats["plus"].numProcessedRows, 3 * flatSize);
+}
+
+TEST_F(ExprTest, disableDeferredLazyLoading) {
+  // Verify that deferred lazy loading is disabled when the config is set by
+  // confirming that all rows are loaded even when only a subset is required.
+
+  // The following expression only requires 1 row to be loaded on c1. 
+ const std::vector expressions = {"(c0 < 2) AND (c1 > 0)"}; + auto c0 = makeFlatVector({1, 2, 3}); + auto valueAt = [](auto row) { return row; }; + // Confirm only 1 row is loaded. + auto c1 = makeLazyFlatVector(3, valueAt, nullptr, 1); + + // Deferred lazy loading enabled (by default). Confirm that only required rows + // are loaded. + auto [result, stats] = evaluateMultipleWithStats( + expressions, makeRowVector({c0, c1}), {}, execCtx_.get()); + + // Deferred lazy loading disabled. Confirm all rows will be loaded. + std::unordered_map configData( + {{core::QueryConfig::kDebugDisableExpressionWithLazyInputs, "true"}}); + auto queryCtx = velox::core::QueryCtx::create( + nullptr, core::QueryConfig(std::move(configData))); + auto execCtx = std::make_unique(pool_.get(), queryCtx.get()); + + // Confirm that all rows are loaded. + c1 = makeLazyFlatVector(3, valueAt, nullptr, 3); + std::tie(result, stats) = evaluateMultipleWithStats( + expressions, makeRowVector({c0, c1}), {}, execCtx.get()); +} + } // namespace } // namespace facebook::velox::test